Skip to main content

rust_yaml/scanner/
mod.rs

1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
15/// Trait for YAML scanners that convert character streams to tokens
16pub trait Scanner {
17    /// Check if there are more tokens available
18    fn check_token(&self) -> bool;
19
20    /// Peek at the next token without consuming it
21    fn peek_token(&self) -> Result<Option<&Token>>;
22
23    /// Get the next token, consuming it
24    fn get_token(&mut self) -> Result<Option<Token>>;
25
26    /// Reset the scanner state
27    fn reset(&mut self);
28
29    /// Get the current position in the input
30    fn position(&self) -> Position;
31
32    /// Get the input text for error reporting
33    fn input(&self) -> &str;
34}
35
/// How the trailing line breaks of a block scalar are treated
/// (YAML 1.2 §8.1.1.2).
///
/// - `Strip` (`-`): no final line break and no trailing empty lines.
/// - `Clip` (default): exactly one final line break, no trailing empty lines.
/// - `Keep` (`+`): the final line break and all trailing empty lines survive.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    Strip,
    Clip,
    Keep,
}

/// Trim the tail of a collected block scalar according to `mode`.
///
/// The collectors emit a `\n` after every line (content or blank), so the
/// raw text normally ends in a run of line feeds. Per spec §8.1.1.2:
///
/// - **Keep** returns the text untouched.
/// - **Strip** removes the whole trailing run of `\n`.
/// - **Clip** removes the run and then restores a single `\n`, but only if
///   something other than line feeds remains. An empty clip scalar is `""`,
///   not `"\n"` (yaml-test-suite K858).
fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
    let restore_final_break = match mode {
        ChompingMode::Keep => return s,
        ChompingMode::Strip => false,
        ChompingMode::Clip => true,
    };

    // Strip and Clip both start by dropping every trailing line feed.
    let content_len = s.trim_end_matches('\n').len();
    s.truncate(content_len);

    if restore_final_break && !s.is_empty() {
        s.push('\n');
    }
    s
}
81
82/// A basic scanner implementation for YAML tokenization
83#[derive(Debug)]
84#[allow(dead_code)]
85pub struct BasicScanner {
86    input: String,
87    position: Position,
88    current_char: Option<char>,
89    tokens: Vec<Token>,
90    token_index: usize,
91    done: bool,
92    indent_stack: Vec<usize>,
93    current_indent: usize,
94    allow_simple_key: bool,
95    simple_key_allowed: bool,
96    flow_level: usize,
97    preserve_comments: bool,
98    // Indentation style detection
99    detected_indent_style: Option<crate::value::IndentStyle>,
100    indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
101    previous_indent_level: usize,       // Track the previous indentation for style detection
102    // Performance optimizations
103    buffer: String,                   // Reusable string buffer for token values
104    char_cache: Vec<char>,            // Cached characters for faster access
105    char_indices: Vec<(usize, char)>, // Cached character indices for O(1) lookups
106    current_char_index: usize,        // Current index in char_cache
107    profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
108    // Error tracking
109    scanning_error: Option<Error>, // Store scanning errors for later retrieval
110    // Resource tracking
111    limits: Limits,
112    resource_tracker: ResourceTracker,
113    // Track inline nested sequences that need closing
114    inline_sequence_depth: usize,
115    // Track compact-notation sequences (where `-` is at the same indent as
116    // the parent mapping keys). These are NOT on indent_stack, so we need
117    // separate tracking to know when to emit BlockEnd for them.
118    compact_sequence_indents: Vec<usize>,
119    // Parallel to indent_stack: true when the entry was pushed by a block
120    // sequence, false when by a mapping. Lets us distinguish "continuing a
121    // regular sequence" from "starting a compact sequence at same indent".
122    indent_is_sequence: Vec<bool>,
123}
124
125impl BasicScanner {
126    /// Create a new scanner from input string
127    pub fn new(input: String) -> Self {
128        Self::with_limits(input, Limits::default())
129    }
130
131    /// Create a new scanner with custom resource limits
132    pub fn with_limits(input: String, limits: Limits) -> Self {
133        let char_cache: Vec<char> = input.chars().collect();
134        let char_indices: Vec<(usize, char)> = input.char_indices().collect();
135        let current_char = char_cache.first().copied();
136
137        // Track document size for resource limits
138        let mut resource_tracker = ResourceTracker::new();
139        if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
140            // If the input is too large, create scanner with error state
141            return Self {
142                current_char: None,
143                input,
144                position: Position::start(),
145                tokens: Vec::new(),
146                token_index: 0,
147                done: true,
148                indent_stack: vec![0],
149                current_indent: 0,
150                allow_simple_key: false,
151                simple_key_allowed: false,
152                flow_level: 0,
153                preserve_comments: false,
154                detected_indent_style: None,
155                indent_samples: Vec::new(),
156                previous_indent_level: 0,
157                buffer: String::new(),
158                char_cache: Vec::new(),
159                char_indices: Vec::new(),
160                current_char_index: 0,
161                profiler: None,
162                scanning_error: Some(e),
163                limits,
164                resource_tracker,
165                inline_sequence_depth: 0,
166                compact_sequence_indents: Vec::new(),
167                indent_is_sequence: vec![false],
168            };
169        }
170
171        Self {
172            current_char,
173            input,
174            position: Position::start(),
175            tokens: Vec::new(),
176            token_index: 0,
177            done: false,
178            indent_stack: vec![0], // Always start with base indentation
179            current_indent: 0,
180            allow_simple_key: true,
181            simple_key_allowed: true,
182            flow_level: 0,
183            preserve_comments: false,
184            detected_indent_style: None,
185            indent_samples: Vec::new(),
186            previous_indent_level: 0,
187            buffer: String::with_capacity(64), // Pre-allocate buffer
188            char_cache,
189            char_indices,
190            current_char_index: 0,
191            profiler: std::env::var("RUST_YAML_PROFILE")
192                .ok()
193                .map(|_| crate::profiling::YamlProfiler::new()),
194            scanning_error: None,
195            limits,
196            resource_tracker,
197            inline_sequence_depth: 0,
198            compact_sequence_indents: Vec::new(),
199            indent_is_sequence: vec![false],
200        }
201    }
202
203    /// Create a new scanner with eager token scanning (for compatibility)
204    pub fn new_eager(input: String) -> Self {
205        Self::new_eager_with_limits(input, Limits::default())
206    }
207
208    /// Create a new scanner with eager token scanning and custom limits
209    pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
210        let mut scanner = Self::with_limits(input, limits);
211        // Store any scanning errors for later retrieval
212        if let Err(error) = scanner.scan_all_tokens() {
213            scanner.scanning_error = Some(error);
214        }
215        scanner
216    }
217
218    /// Create a new scanner with comment preservation enabled
219    pub fn new_with_comments(input: String) -> Self {
220        let mut scanner = Self::new(input);
221        scanner.preserve_comments = true;
222        scanner
223    }
224
225    /// Create a new scanner with comments and custom limits
226    pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
227        let mut scanner = Self::with_limits(input, limits);
228        scanner.preserve_comments = true;
229        scanner
230    }
231
232    /// Create a new scanner with eager scanning and comment preservation
233    pub fn new_eager_with_comments(input: String) -> Self {
234        let mut scanner = Self::new_with_comments(input);
235        // Mirror `new_eager_with_limits`: record scanning errors instead
236        // of discarding them (#19). Previously this used
237        // `unwrap_or(())`, silently truncating the token stream and
238        // returning a scanner whose `has_scanning_error()` reported
239        // false — silent data loss for comment-preserving callers.
240        if let Err(error) = scanner.scan_all_tokens() {
241            scanner.scanning_error = Some(error);
242        }
243        scanner
244    }
245
246    /// Get the detected indentation style from the document
247    pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
248        self.detected_indent_style.as_ref()
249    }
250
251    /// Check if there was a scanning error
252    pub const fn has_scanning_error(&self) -> bool {
253        self.scanning_error.is_some()
254    }
255
256    /// Get the scanning error if any
257    #[allow(clippy::missing_const_for_fn)]
258    pub fn take_scanning_error(&mut self) -> Option<Error> {
259        self.scanning_error.take()
260    }
261
262    /// Advance to the next character
263    fn advance(&mut self) -> Option<char> {
264        if let Some(ch) = self.current_char {
265            self.position = self.position.advance(ch);
266            self.current_char_index += 1;
267
268            if self.current_char_index < self.char_cache.len() {
269                self.current_char = Some(self.char_cache[self.current_char_index]);
270            } else {
271                self.current_char = None;
272            }
273        }
274
275        self.current_char
276    }
277
278    /// Skip whitespace characters (excluding newlines)
279    fn skip_whitespace(&mut self) {
280        while let Some(ch) = self.current_char {
281            if ch == ' ' || ch == '\t' {
282                self.advance();
283            } else {
284                break;
285            }
286        }
287    }
288
289    /// Handle indentation and produce block tokens if necessary
290    fn handle_indentation(&mut self) -> Result<()> {
291        // In flow context: if there is a non-trivial enclosing block
292        // (indent_stack has more than the implicit root level), each
293        // continuation line that has content must be indented MORE than
294        // that enclosing block's indent. \`flow: [a,\\nb,c]\` with \`b\`
295        // at col 1 violates this rule because the block mapping enclosing
296        // \`flow:\` sits at indent 0 (yaml-test-suite 9C9N).
297        //
298        // Top-level flow (no enclosing block; indent_stack is just \[0\])
299        // is exempt — `[a,\\nb]` is fine there because the flow content
300        // isn't nested inside any block (yaml-test-suite 4ZYM).
301        if self.flow_level > 0 {
302            if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
303                let mut probe = 0usize;
304                let mut i = self.current_char_index;
305                while i < self.char_cache.len() {
306                    match self.char_cache[i] {
307                        ' ' => {
308                            probe += 1;
309                            i += 1;
310                        }
311                        '\t' => i += 1,
312                        _ => break,
313                    }
314                }
315                let has_content = self
316                    .char_cache
317                    .get(i)
318                    .map_or(false, |c| !matches!(c, '\n' | '\r'));
319                // A line that begins with the matching flow closer
320                // (\`]\` / \`}\`) is allowed at the parent indent — it
321                // closes the flow collection, not adds content
322                // (yaml-test-suite NKF9 trailing-line \`}\` at col 1).
323                let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
324                if has_content && !is_closer {
325                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
326                    if probe <= parent_indent {
327                        return Err(Error::scan(
328                            self.position,
329                            "Flow content line is not indented enough".to_string(),
330                        ));
331                    }
332                }
333            }
334            return Ok(());
335        }
336
337        let line_start_pos = self.position;
338        let mut indent = 0;
339        let mut has_tabs = false;
340        let mut has_spaces = false;
341        let _indent_start_pos = self.position;
342
343        // Count indentation and detect style
344        while let Some(ch) = self.current_char {
345            if ch == ' ' {
346                indent += 1;
347                has_spaces = true;
348                self.advance();
349            } else if ch == '\t' {
350                indent += 8; // Tab counts as 8 spaces for indentation calculation
351                has_tabs = true;
352                self.advance();
353            } else {
354                break;
355            }
356        }
357
358        // Analyze indentation pattern for style detection
359        // Only analyze if there's actual content after the indentation (not just whitespace)
360        if indent > 0
361            && self.current_char.is_some()
362            && !matches!(self.current_char, Some('\n' | '\r'))
363        {
364            self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
365        }
366
367        // YAML 1.2 §6.1 does NOT require all indents to be multiples
368        // of a single "indent width". Siblings must share a column;
369        // children must indent further; but any positive amount works
370        // (e.g. `key:\n  child:\n   grandchild:` with widths 2, 1
371        // is legal). The earlier strict-multiple-of-N check rejected
372        // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
373        // UGM3. We rely on the indent_stack-driven open/close logic
374        // (and the per-block "more than parent" rule enforced
375        // elsewhere) to catch genuine mis-indentation.
376
377        // Update previous indentation level for future comparisons
378        if indent > 0 {
379            self.previous_indent_level = indent;
380        }
381
382        // Update current indentation level
383        self.current_indent = indent;
384
385        // Close compact-notation sequences whose scope ends at this line.
386        // A compact sequence (where `-` shares the indent of the parent
387        // mapping keys) ends when the next content line at that indent is
388        // NOT a block entry (`- `).  We must emit the sequence's BlockEnd
389        // BEFORE popping the indent_stack so that the nesting order is
390        // correct (sequence closes before its parent mapping).
391        let has_content =
392            self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
393        if has_content {
394            let is_block_entry = self.current_char == Some('-')
395                && self.peek_char(1).map_or(true, |c| c.is_whitespace());
396            while let Some(&seq_indent) = self.compact_sequence_indents.last() {
397                if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
398                    self.compact_sequence_indents.pop();
399                    self.tokens
400                        .push(Token::simple(TokenType::BlockEnd, line_start_pos));
401                } else {
402                    break;
403                }
404            }
405        }
406
407        // Check if we need to emit block end tokens for decreased indentation
408        let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
409        while let Some(&last_indent) = self.indent_stack.last() {
410            if indent < last_indent && last_indent > 0 {
411                self.indent_stack.pop();
412                self.indent_is_sequence.pop();
413                self.tokens
414                    .push(Token::simple(TokenType::BlockEnd, line_start_pos));
415            } else {
416                break;
417            }
418        }
419
420        // §6.1: after a dedent, the new line's indent must match some
421        // existing container level — keys/items at a sibling level
422        // must share a column. Landing at a column that is between
423        // two stack levels (e.g. parent at 0, just-closed at 3, new
424        // line at 1) is invalid because no open mapping/sequence sits
425        // at indent 1 (yaml-test-suite DMG6, N4JP).
426        //
427        // The check applies only when:
428        //   * we actually dedented (pre-pop top was deeper than now),
429        //   * the new line has content (the next char is not blank /
430        //     newline / EOF / comment),
431        //   * indent doesn't match the new top.
432        if pre_pop_top > 0
433            && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
434            && self
435                .current_char
436                .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
437            && indent != self.indent_stack.last().copied().unwrap_or(0)
438        {
439            // Allow if indent is a valid deeper level — e.g.
440            // sibling at depth then deeper child — but for the
441            // dedent path indent must equal a known stack level.
442            return Err(Error::scan(
443                self.position,
444                format!(
445                    "Indentation {indent} doesn't match any open container (expected {} or deeper)",
446                    self.indent_stack.last().copied().unwrap_or(0)
447                ),
448            ));
449        }
450
451        Ok(())
452    }
453
454    /// Analyze indentation pattern to detect the document's indentation style
455    fn analyze_indentation_pattern(
456        &mut self,
457        current_indent: usize,
458        has_tabs: bool,
459        has_spaces: bool,
460    ) -> Result<()> {
461        // Prevent mixed indentation (tabs + spaces on same line).
462        // Carve-out: a tab AFTER one or more spaces and BEFORE
463        // value-position content (not a key) is content-area
464        // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
465        // space is indent, the tab is a separator before \`bar\`
466        // which is the value of \`foo:\` (yaml-test-suite DK95/00).
467        if has_tabs && has_spaces {
468            // Peek ahead: if the content after the tab+spaces area
469            // contains a key marker (`: ` or `:`+EOL), treat as
470            // indentation (invalid). Otherwise it's a value line.
471            let looks_like_key = self.line_after_indent_is_implicit_key();
472            if looks_like_key {
473                let context =
474                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
475                        .with_suggestion(
476                            "Use either tabs OR spaces for indentation, not both".to_string(),
477                        );
478                return Err(Error::invalid_character_with_context(
479                    self.position,
480                    '\t',
481                    "mixed indentation",
482                    context,
483                ));
484            }
485        }
486        // §6.1: indentation must be space characters only. Pure-tab
487        // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
488        // 4EJS). Two carve-outs:
489        //   * The mixed case is caught by the earlier branch.
490        //   * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
491        //     at the root are not "block indentation" — there's no
492        //     enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
493        //     them.
494        if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
495            let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
496                .with_suggestion("Use space characters for indentation".to_string());
497            return Err(Error::invalid_character_with_context(
498                self.position,
499                '\t',
500                "indentation",
501                context,
502            ));
503        }
504
505        // If we detected tabs, check for mixed indentation across lines
506        if has_tabs {
507            match self.detected_indent_style {
508                None => {
509                    // First time detecting indentation style - set to tabs
510                    self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
511                }
512                Some(crate::value::IndentStyle::Spaces(_)) => {
513                    // Previously detected spaces, now seeing tabs - mixed indentation error
514                    let context =
515                        crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
516                            .with_suggestion(
517                                "Use consistent indentation style throughout the document"
518                                    .to_string(),
519                            );
520                    return Err(Error::invalid_character_with_context(
521                        self.position,
522                        '\t',
523                        "mixed indentation",
524                        context,
525                    ));
526                }
527                Some(crate::value::IndentStyle::Tabs) => {
528                    // Already using tabs - this is consistent
529                }
530            }
531            return Ok(());
532        }
533
534        // For spaces, check for mixed indentation across lines first
535        if has_spaces {
536            // Check if we previously detected tabs
537            if matches!(
538                self.detected_indent_style,
539                Some(crate::value::IndentStyle::Tabs)
540            ) {
541                let context =
542                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
543                        .with_suggestion(
544                            "Use consistent indentation style throughout the document".to_string(),
545                        );
546                return Err(Error::invalid_character_with_context(
547                    self.position,
548                    ' ',
549                    "mixed indentation",
550                    context,
551                ));
552            }
553
554            // Calculate the indentation level difference
555            if current_indent > self.previous_indent_level {
556                let indent_diff = current_indent - self.previous_indent_level;
557
558                // Store this sample for analysis (but only meaningful differences)
559                if indent_diff > 0 && indent_diff <= 8 {
560                    // Reasonable indentation range
561                    self.indent_samples.push((indent_diff, false));
562
563                    // Try to determine consistent indentation width
564                    if self.detected_indent_style.is_none() {
565                        self.detect_space_indentation_width();
566                    }
567                }
568            }
569
570            // YAML 1.2 §6.1 does NOT require all indents to be multiples
571            // of a single "indent width". Sibling lines must share a
572            // column and children must indent deeper than parents, but
573            // any positive amount works. The "multiple of N" check
574            // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
575            // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
576            // open/close logic for genuine mis-indentation. The detected
577            // style is still recorded for later style-preservation use
578            // (e.g. emitter), it just no longer drives validation.
579            // self.validate_indentation_consistency(current_indent)?;
580        }
581
582        Ok(())
583    }
584
585    /// Detect the consistent space indentation width from samples
586    fn detect_space_indentation_width(&mut self) {
587        if self.indent_samples.is_empty() {
588            return; // Need at least 1 sample
589        }
590
591        // Find the most common indentation width
592        let mut width_counts = std::collections::HashMap::new();
593
594        for &(width, is_tabs) in &self.indent_samples {
595            if !is_tabs && width > 0 {
596                *width_counts.entry(width).or_insert(0) += 1;
597            }
598        }
599
600        // Find the most frequent width - be more aggressive and detect early
601        if let Some((&most_common_width, &_count)) =
602            width_counts.iter().max_by_key(|&(_, count)| count)
603        {
604            // Set on first consistent sample to enable stricter validation
605            self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
606        }
607    }
608
609    /// Check if the given indentation level is valid based on current context
610    #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
611    fn is_valid_indentation_level(&self, indent: usize) -> bool {
612        // For now, allow any indentation that could represent valid nesting
613        // In the future, this could be made more strict by checking against
614        // the current indent_stack to ensure proper nesting
615        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
616            // Must be a multiple of the detected width
617            indent % width == 0
618        } else {
619            // If no style detected yet, allow any indentation
620            true
621        }
622    }
623
624    /// Validate that current indentation is consistent with detected style
625    fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
626        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
627            // Check if current indentation is a multiple of the detected width
628            if current_indent > 0 && current_indent % width != 0 {
629                let lower_level = (current_indent / width) * width;
630                let higher_level = lower_level + width;
631                let suggestion = format!(
632                    "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
633                    width, lower_level, higher_level, current_indent
634                );
635                let context =
636                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
637                        .with_suggestion(suggestion);
638                return Err(Error::indentation_with_context(
639                    self.position,
640                    (current_indent / width) * width, // expected (nearest valid level)
641                    current_indent,                   // found
642                    context,
643                ));
644            }
645        }
646        Ok(())
647    }
648
649    /// Check if current position starts a plain scalar
650    fn is_plain_scalar_start(&self) -> bool {
651        self.current_char.map_or(false, |ch| match ch {
652            // Pure indicators — never start a plain scalar.
653            ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
654            | '@' | '`' => false,
655            // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
656            // the next character is non-whitespace (and, in flow context,
657            // not a flow indicator). Otherwise they act as indicators
658            // (complex-key marker / value separator / block-entry marker).
659            '?' | ':' | '-' => match self.peek_char(1) {
660                None => false,
661                Some(c) if c.is_whitespace() => false,
662                Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
663                Some(_) => true,
664            },
665            _ => !ch.is_whitespace(),
666        })
667    }
668
669    /// Check if the value is a YAML boolean
670    fn is_yaml_bool(value: &str) -> bool {
671        matches!(
672            value,
673            "true"
674                | "false"
675                | "True"
676                | "False"
677                | "TRUE"
678                | "FALSE"
679                | "yes"
680                | "no"
681                | "Yes"
682                | "No"
683                | "YES"
684                | "NO"
685                | "on"
686                | "off"
687                | "On"
688                | "Off"
689                | "ON"
690                | "OFF"
691        )
692    }
693
694    /// Check if the value is a YAML null
695    fn is_yaml_null(value: &str) -> bool {
696        matches!(value, "null" | "Null" | "NULL" | "~" | "")
697    }
698
699    /// Normalize a scalar value based on YAML rules.
700    ///
701    /// The scanner preserves the original text of plain scalars. Type
702    /// resolution (including version-aware bool/null mapping) happens in
703    /// the composer (see `crate::resolver::resolve_plain_scalar`). This
704    /// preserves enough information for the composer to apply the
705    /// YAML 1.1 vs 1.2 distinction and for round-trip emitters to
706    /// recover the original spelling.
707    fn normalize_scalar(value: String) -> String {
708        value
709    }
710
711    /// Scan a number token
712    fn scan_number(&mut self) -> Result<Token> {
713        let start_pos = self.position;
714        let mut value = String::new();
715
716        // Handle negative numbers
717        if self.current_char == Some('-') {
718            value.push('-');
719            self.advance();
720        }
721
722        // Scan digits
723        while let Some(ch) = self.current_char {
724            if ch.is_ascii_digit() {
725                value.push(ch);
726                self.advance();
727            } else if ch == '.' {
728                value.push(ch);
729                self.advance();
730                // Scan fractional part
731                while let Some(ch) = self.current_char {
732                    if ch.is_ascii_digit() {
733                        value.push(ch);
734                        self.advance();
735                    } else {
736                        break;
737                    }
738                }
739                break;
740            } else {
741                break;
742            }
743        }
744
745        Ok(Token::new(
746            TokenType::Scalar(value, tokens::QuoteStyle::Plain),
747            start_pos,
748            self.position,
749        ))
750    }
751
752    /// Scan a plain scalar (unquoted string)
753    fn scan_plain_scalar(&mut self) -> Result<Token> {
754        let start_pos = self.position;
755        let start_col = start_pos.column;
756        let mut value = String::new();
757        let mut multi_line = false;
758
759        loop {
760            // Scan content on the current line until we hit a stop condition.
761            while let Some(ch) = self.current_char {
762                if self.flow_level == 0 {
763                    match ch {
764                        '\n' | '\r' => break,
765                        ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
766                        '#' if value.is_empty()
767                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
768                        {
769                            break;
770                        }
771                        _ => {}
772                    }
773                } else {
774                    match ch {
775                        // Same line-break handling as block context: stop
776                        // collecting raw content at `\n`/`\r`, then let the
777                        // outer fold logic decide whether the next line
778                        // continues this scalar (yaml-test-suite 8KB6,
779                        // 8UDB, 9BXH).
780                        '\n' | '\r' => break,
781                        ',' | '[' | ']' | '{' | '}' => break,
782                        // In flow context, `:` is a key-value separator
783                        // when followed by whitespace OR any flow indicator
784                        // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
785                        // suite FRK4 (`{ ? foo :, ... }`).
786                        ':' if self
787                            .peek_char(1)
788                            .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
789                        {
790                            break;
791                        }
792                        '#' if value.is_empty()
793                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
794                        {
795                            break;
796                        }
797                        _ => {}
798                    }
799                }
800                value.push(ch);
801                self.advance();
802            }
803
804            // If we didn't stop at a newline, this scalar is complete.
805            if !matches!(self.current_char, Some('\n' | '\r')) {
806                break;
807            }
808
809            // Per §6.5 line folding, trailing whitespace on the line is
810            // dropped (it gets replaced by the fold separator that the
811            // next continuation block emits).
812            while matches!(value.chars().last(), Some(' ' | '\t')) {
813                value.pop();
814            }
815
816            // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
817            // the same plain scalar. A continuation line must be:
818            //   * indented strictly more than the scalar's start column,
819            //   * not a document marker (`---` / `...`),
820            //   * not a comment-only line,
821            //   * not empty-with-EOF.
822            // Save state for backtracking if continuation isn't allowed.
823            let saved_position = self.position;
824            let saved_index = self.current_char_index;
825            let saved_char = self.current_char;
826
827            // Count physical newlines we skip; whitespace within the lines
828            // is also consumed.
829            let mut newlines = 0usize;
830            loop {
831                match self.current_char {
832                    Some('\n') => {
833                        newlines += 1;
834                        self.advance();
835                    }
836                    Some('\r') => {
837                        self.advance();
838                    }
839                    Some(' ' | '\t') => {
840                        self.advance();
841                    }
842                    _ => break,
843                }
844            }
845
846            let next_col = self.position.column;
847            let next_ch = self.current_char;
848            let is_doc_marker = matches!(next_ch, Some('-' | '.'))
849                && self.peek_char(1) == next_ch
850                && self.peek_char(2) == next_ch
851                && self.peek_char(3).map_or(true, |c| c.is_whitespace());
852
853            // Continuation column rule:
854            //   * Flow context: no column rule, only flow indicators
855            //     terminate (8KB6, 8UDB, 9BXH).
856            //   * Block context: must be strictly deeper than the parent
857            //     block's key column. The parent indent is the max of
858            //     `indent_stack.last()` (block mapping/sequence indent)
859            //     and `compact_sequence_indents.last()` — the latter
860            //     tracks sequences opened compactly (e.g. `? - x` where
861            //     the dash didn't push to indent_stack). Without the
862            //     compact-stack check, `? - Detroit Tigers\n  - Chicago`
863            //     would fold both lines into one scalar (yaml-test-
864            //     suite M5DY).
865            //     Fall back to `next_col >= start_col` for top-level
866            //     scalars where there's no enclosing block.
867            let column_ok = if self.flow_level > 0 {
868                true
869            } else {
870                let block_indent = self.indent_stack.last().copied().unwrap_or(0);
871                let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
872                let parent_indent = block_indent.max(compact_indent);
873                next_col >= parent_indent + 2 || next_col >= start_col
874            };
875            let can_continue = next_ch.is_some()
876                && !matches!(next_ch, Some('\n' | '\r' | '#'))
877                && column_ok
878                && !is_doc_marker
879                && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));
880
881            if !can_continue {
882                self.position = saved_position;
883                self.current_char_index = saved_index;
884                self.current_char = saved_char;
885                break;
886            }
887
888            // Append fold separator: single newline → space; N>1 newlines
889            // collapse to N-1 retained newlines (YAML §6.5 line folding).
890            if newlines <= 1 {
891                value.push(' ');
892            } else {
893                for _ in 0..(newlines - 1) {
894                    value.push('\n');
895                }
896            }
897            multi_line = true;
898        }
899
900        // YAML 1.2 §8.1.3: implicit keys must be on a single line. If the
901        // plain scalar folded across line breaks AND the next non-
902        // whitespace char is `:` (key-value separator), it's about to be
903        // used as an implicit key — reject (yaml-test-suite G7JE).
904        if multi_line && self.flow_level == 0 {
905            let mut off = 0isize;
906            while matches!(self.peek_char(off), Some(' ' | '\t')) {
907                off += 1;
908            }
909            if self.peek_char(off) == Some(':') {
910                return Err(Error::scan(
911                    self.position,
912                    "Multi-line plain scalar may not be used as an implicit key".to_string(),
913                ));
914            }
915        }
916
917        self.resource_tracker
918            .check_string_length(&self.limits, value.len())?;
919
920        let value = value.trim_end().to_string();
921        let normalized_value = Self::normalize_scalar(value);
922
923        Ok(Token::new(
924            TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
925            start_pos,
926            self.position,
927        ))
928    }
929
930    /// Scan a quoted string
931    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
932        let start_pos = self.position;
933        let mut value = String::new();
934
935        // Determine quote style based on quote character
936        let quote_style = match quote_char {
937            '\'' => tokens::QuoteStyle::Single,
938            '"' => tokens::QuoteStyle::Double,
939            _ => tokens::QuoteStyle::Plain,
940        };
941
942        self.advance(); // Skip opening quote
943        let mut closed = false;
944        let mut multi_line = false;
945        // High-water mark of bytes contributed by escape sequences. The
946        // trailing-whitespace strip at fold time must not pop past it,
947        // because an escape-produced \t / space is literal content
948        // (yaml-test-suite DE56/00, DE56/01).
949        let mut escape_end: usize = 0;
950
951        while let Some(ch) = self.current_char {
952            if ch == quote_char {
953                // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
954                // collapsing to a single `'`. Detect that here BEFORE
955                // treating the quote as the closing delimiter.
956                if quote_char == '\'' && self.peek_char(1) == Some('\'') {
957                    value.push('\'');
958                    self.advance();
959                    self.advance();
960                    continue;
961                }
962                self.advance(); // Skip closing quote
963                closed = true;
964                break;
965            } else if ch == '\\' && quote_char == '"' {
966                self.advance();
967                if let Some(escaped) = self.current_char {
968                    match escaped {
969                        // YAML 1.2 §5.7 double-quoted escape allowlist.
970                        'n' => value.push('\n'),
971                        't' => value.push('\t'),
972                        'r' => value.push('\r'),
973                        '\\' => value.push('\\'),
974                        '"' => value.push('"'),
975                        '0' => value.push('\0'),
976                        'a' => value.push('\x07'),
977                        'b' => value.push('\x08'),
978                        'f' => value.push('\x0C'),
979                        'v' => value.push('\x0B'),
980                        'e' => value.push('\x1B'),
981                        ' ' => value.push(' '),
982                        '/' => value.push('/'),
983                        'N' => value.push('\u{0085}'),
984                        '_' => value.push('\u{00A0}'),
985                        'L' => value.push('\u{2028}'),
986                        'P' => value.push('\u{2029}'),
987                        '\n' => {
988                            // Escaped line break (§7.3.2): the newline is
989                            // dropped AND leading whitespace on the next
990                            // line is excluded from the content.
991                            self.advance();
992                            while matches!(self.current_char, Some(' ' | '\t')) {
993                                self.advance();
994                            }
995                            continue;
996                        }
997                        '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
998                        // Hex / Unicode escapes per YAML 1.2 §5.7:
999                        //   \xNN     — 2 hex digits, codepoint  ≤ 0xFF
1000                        //   \uNNNN   — 4 hex digits, codepoint  ≤ 0xFFFF
1001                        //   \UNNNNNNNN — 8 hex digits, full Unicode codepoint
1002                        'x' | 'u' | 'U' => {
1003                            let n = match escaped {
1004                                'x' => 2,
1005                                'u' => 4,
1006                                _ => 8,
1007                            };
1008                            self.advance(); // consume the x/u/U
1009                            let mut codepoint: u32 = 0;
1010                            for _ in 0..n {
1011                                let c = self.current_char.ok_or_else(|| {
1012                                    Error::scan(
1013                                        self.position,
1014                                        format!("Truncated \\{escaped} escape"),
1015                                    )
1016                                })?;
1017                                let d = c.to_digit(16).ok_or_else(|| {
1018                                    Error::scan(
1019                                        self.position,
1020                                        format!("Invalid hex digit `{c}` in \\{escaped} escape"),
1021                                    )
1022                                })?;
1023                                codepoint = (codepoint << 4) | d;
1024                                self.advance();
1025                            }
1026                            let ch = char::from_u32(codepoint).ok_or_else(|| {
1027                                Error::scan(
1028                                    self.position,
1029                                    format!("Invalid Unicode codepoint U+{codepoint:X}"),
1030                                )
1031                            })?;
1032                            value.push(ch);
1033                            escape_end = value.len();
1034                            continue; // already advanced past hex digits
1035                        }
1036                        // Everything else is invalid per spec.
1037                        _ => {
1038                            return Err(Error::scan(
1039                                self.position,
1040                                format!("Invalid escape sequence: \\{escaped}"),
1041                            ));
1042                        }
1043                    }
1044                    escape_end = value.len();
1045                    self.advance();
1046                }
1047            } else if ch == '\\' {
1048                // Single-quoted strings have no backslash escapes — `\` is
1049                // a literal character. (Single-quote escape is `''`.)
1050                value.push(ch);
1051                self.advance();
1052            } else if ch == '\n' || ch == '\r' {
1053                // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
1054                // line folding: a single newline within a quoted scalar
1055                // folds to a space; N>1 consecutive newlines retain N-1;
1056                // leading whitespace on the continuation line is excluded.
1057                let mut newlines = 0usize;
1058                // §6.1: tabs cannot be indentation. A continuation
1059                // line that BEGINS with a tab (no leading spaces) in
1060                // an enclosing block context is invalid (yaml-test-
1061                // suite DK95/01). Tabs that appear AFTER spaces in
1062                // the same indent area are content, not indentation.
1063                let mut just_after_newline = false;
1064                while let Some(c) = self.current_char {
1065                    match c {
1066                        '\n' => {
1067                            newlines += 1;
1068                            multi_line = true;
1069                            self.advance();
1070                            just_after_newline = true;
1071                        }
1072                        '\r' => {
1073                            self.advance();
1074                        }
1075                        ' ' => {
1076                            self.advance();
1077                            just_after_newline = false;
1078                        }
1079                        '\t' if just_after_newline
1080                            && self.flow_level == 0
1081                            && (self.indent_stack.len() > 1
1082                                || !self.compact_sequence_indents.is_empty()) =>
1083                        {
1084                            return Err(Error::scan(
1085                                self.position,
1086                                "Tab cannot serve as indentation of quoted scalar continuation"
1087                                    .to_string(),
1088                            ));
1089                        }
1090                        '\t' => {
1091                            self.advance();
1092                        }
1093                        _ => break,
1094                    }
1095                }
1096                // §8.1.4: a multi-line quoted scalar inside a block
1097                // context must indent each continuation more than the
1098                // enclosing block. \`quoted: "a\\nb"\` with \`b\` at col 1
1099                // violates the rule because \`quoted:\` sits at indent 0
1100                // (yaml-test-suite QB6E). Only fires when there IS an
1101                // enclosing block (indent_stack > [0] or compact-seq
1102                // active) — top-level quoted scalars with continuation
1103                // at col 1 are legal.
1104                if newlines > 0
1105                    && self.flow_level == 0
1106                    && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
1107                    && !matches!(self.current_char, None | Some('\n' | '\r'))
1108                {
1109                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
1110                    let indent = self.position.column.saturating_sub(1);
1111                    if indent <= parent_indent {
1112                        return Err(Error::scan(
1113                            self.position,
1114                            "Quoted scalar continuation line is not indented enough".to_string(),
1115                        ));
1116                    }
1117                }
1118                // §6.8: a doc-start/end marker (`---` or `...`) at
1119                // column 1 always terminates the current document.
1120                // Encountering one inside an unterminated quoted
1121                // scalar is invalid — the quote escapes nothing past
1122                // the doc boundary (yaml-test-suite 5TRB, RXY3,
1123                // 9MQT/01).
1124                if self.position.column == 1 {
1125                    let next3: String = self
1126                        .char_cache
1127                        .get(self.current_char_index..self.current_char_index + 3)
1128                        .map(|s| s.iter().collect())
1129                        .unwrap_or_default();
1130                    if (next3 == "---" || next3 == "...")
1131                        && self
1132                            .char_cache
1133                            .get(self.current_char_index + 3)
1134                            .map_or(true, |c| c.is_whitespace())
1135                    {
1136                        return Err(Error::scan(
1137                            self.position,
1138                            format!(
1139                                "Document {} marker `{}` inside quoted scalar",
1140                                if next3 == "---" { "start" } else { "end" },
1141                                next3
1142                            ),
1143                        ));
1144                    }
1145                }
1146                // Drop trailing whitespace on the prior line (the bytes
1147                // we already pushed) before applying the fold. Don't
1148                // strip past `escape_end` — escape-produced whitespace
1149                // is literal content, not "trailing" line whitespace.
1150                while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
1151                    value.pop();
1152                }
1153                if newlines <= 1 {
1154                    value.push(' ');
1155                } else {
1156                    for _ in 0..(newlines - 1) {
1157                        value.push('\n');
1158                    }
1159                }
1160            } else {
1161                value.push(ch);
1162                self.advance();
1163
1164                // Check string length periodically to fail fast
1165                if value.len() > self.limits.max_string_length {
1166                    return Err(Error::limit_exceeded(format!(
1167                        "String length {} exceeds maximum {}",
1168                        value.len(),
1169                        self.limits.max_string_length
1170                    )));
1171                }
1172            }
1173        }
1174
1175        // Check string length limit
1176        if !closed {
1177            return Err(Error::scan(
1178                self.position,
1179                format!(
1180                    "Unclosed {} quoted string",
1181                    if quote_char == '"' {
1182                        "double"
1183                    } else {
1184                        "single"
1185                    }
1186                ),
1187            ));
1188        }
1189
1190        self.resource_tracker
1191            .check_string_length(&self.limits, value.len())?;
1192
1193        // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
1194        // the line (or sub-expression in flow context) must be empty save
1195        // for a separator. Skip horizontal whitespace and look at the next
1196        // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
1197        // newline/EOF, it's a trailing-content error (yaml-test-suite
1198        // Q4CL: `"quoted2" trailing content`).
1199        {
1200            let mut offset = 0isize;
1201            let mut saw_space = false;
1202            while matches!(self.peek_char(offset), Some(' ' | '\t')) {
1203                saw_space = true;
1204                offset += 1;
1205            }
1206            let next = self.peek_char(offset);
1207            // A `#` is a comment indicator ONLY when preceded by whitespace
1208            // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
1209            let ok = match next {
1210                None => true,
1211                Some('#') => saw_space,
1212                Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
1213            };
1214            if !ok {
1215                return Err(Error::scan(
1216                    self.position,
1217                    format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
1218                ));
1219            }
1220            // YAML 1.2 §8.1.3: implicit keys must be on a single line.
1221            // If the scalar folded across line breaks AND the next non-
1222            // whitespace char is `:` (key-value separator), the scalar
1223            // is being used as an implicit key — error.
1224            if multi_line && self.flow_level == 0 && next == Some(':') {
1225                return Err(Error::scan(
1226                    self.position,
1227                    "Multi-line quoted scalar may not be used as an implicit key".to_string(),
1228                ));
1229            }
1230        }
1231
1232        Ok(Token::new(
1233            TokenType::Scalar(value, quote_style),
1234            start_pos,
1235            self.position,
1236        ))
1237    }
1238
1239    /// Scan document start marker (---)
1240    fn scan_document_start(&mut self) -> Result<Option<Token>> {
1241        if self.current_char == Some('-')
1242            && self.peek_char(1) == Some('-')
1243            && self.peek_char(2) == Some('-')
1244            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1245        {
1246            // Doc markers are invalid inside flow collections.
1247            if self.flow_level > 0 {
1248                return Err(Error::scan(
1249                    self.position,
1250                    "`---` document-start marker is not allowed inside a flow collection"
1251                        .to_string(),
1252                ));
1253            }
1254            let start_pos = self.position;
1255            self.advance(); // -
1256            self.advance(); // -
1257            self.advance(); // -
1258
1259            Ok(Some(Token::new(
1260                TokenType::DocumentStart,
1261                start_pos,
1262                self.position,
1263            )))
1264        } else {
1265            Ok(None)
1266        }
1267    }
1268
1269    /// Scan YAML version directive (%YAML)
1270    fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1271        if self.current_char != Some('%') {
1272            return Ok(None);
1273        }
1274
1275        let start_pos = self.position;
1276        let saved_position = self.position;
1277        self.advance(); // Skip '%'
1278
1279        // Check for "YAML"
1280        if self.current_char == Some('Y')
1281            && self.peek_char(1) == Some('A')
1282            && self.peek_char(2) == Some('M')
1283            && self.peek_char(3) == Some('L')
1284            && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1285        {
1286            self.advance(); // Y
1287            self.advance(); // A
1288            self.advance(); // M
1289            self.advance(); // L
1290
1291            // Skip whitespace
1292            self.skip_whitespace();
1293
1294            // Parse version number (e.g., "1.2")
1295            let major = if let Some(ch) = self.current_char {
1296                if ch.is_ascii_digit() {
1297                    let digit = ch.to_digit(10).unwrap() as u8;
1298                    self.advance();
1299                    digit
1300                } else {
1301                    return Err(Error::scan(
1302                        self.position,
1303                        "Expected major version number after %YAML".to_string(),
1304                    ));
1305                }
1306            } else {
1307                return Err(Error::scan(
1308                    self.position,
1309                    "Expected version after %YAML directive".to_string(),
1310                ));
1311            };
1312
1313            // Expect '.'
1314            if self.current_char != Some('.') {
1315                return Err(Error::scan(
1316                    self.position,
1317                    "Expected '.' in YAML version".to_string(),
1318                ));
1319            }
1320            self.advance();
1321
1322            // Parse minor version
1323            let minor = if let Some(ch) = self.current_char {
1324                if ch.is_ascii_digit() {
1325                    let digit = ch.to_digit(10).unwrap() as u8;
1326                    self.advance();
1327                    digit
1328                } else {
1329                    return Err(Error::scan(
1330                        self.position,
1331                        "Expected minor version number after '.'".to_string(),
1332                    ));
1333                }
1334            } else {
1335                return Err(Error::scan(
1336                    self.position,
1337                    "Expected minor version number".to_string(),
1338                ));
1339            };
1340
1341            // YAML 1.2 §6.8.1: the directive line must end after the
1342            // version (modulo whitespace and an optional comment). Extra
1343            // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1344            // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1345            // whitespace before `#`.
1346            let mut saw_space = false;
1347            while matches!(self.current_char, Some(' ' | '\t')) {
1348                saw_space = true;
1349                self.advance();
1350            }
1351            match self.current_char {
1352                None | Some('\n' | '\r') => {}
1353                Some('#') if saw_space => {
1354                    while let Some(ch) = self.current_char {
1355                        if ch == '\n' || ch == '\r' {
1356                            break;
1357                        }
1358                        self.advance();
1359                    }
1360                }
1361                Some(c) => {
1362                    return Err(Error::scan(
1363                        self.position,
1364                        format!("Unexpected `{c}` after %YAML directive"),
1365                    ));
1366                }
1367            }
1368
1369            Ok(Some(Token::new(
1370                TokenType::YamlDirective(major, minor),
1371                start_pos,
1372                self.position,
1373            )))
1374        } else {
1375            // Not a YAML directive, reset position
1376            self.position = saved_position;
1377            // Properly reset current_char based on saved position
1378            self.current_char = self
1379                .char_indices
1380                .iter()
1381                .find(|(i, _)| *i == saved_position.index)
1382                .map(|(_, ch)| *ch);
1383            // Reset the current_char_index
1384            self.current_char_index = self
1385                .char_indices
1386                .iter()
1387                .position(|(i, _)| *i == saved_position.index)
1388                .unwrap_or(0);
1389            Ok(None)
1390        }
1391    }
1392
1393    /// Scan TAG directive (%TAG)
1394    fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1395        if self.current_char != Some('%') {
1396            return Ok(None);
1397        }
1398
1399        let start_pos = self.position;
1400        let saved_position = self.position;
1401        self.advance(); // Skip '%'
1402
1403        // Check for "TAG"
1404        if self.current_char == Some('T')
1405            && self.peek_char(1) == Some('A')
1406            && self.peek_char(2) == Some('G')
1407            && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1408        {
1409            self.advance(); // T
1410            self.advance(); // A
1411            self.advance(); // G
1412
1413            // Skip whitespace
1414            self.skip_whitespace();
1415
1416            // Parse handle (e.g., "!" or "!!")
1417            let handle = self.scan_tag_handle()?;
1418
1419            // Skip whitespace
1420            self.skip_whitespace();
1421
1422            // Parse prefix (URI)
1423            let prefix = self.scan_tag_prefix()?;
1424
1425            Ok(Some(Token::new(
1426                TokenType::TagDirective(handle, prefix),
1427                start_pos,
1428                self.position,
1429            )))
1430        } else {
1431            // Reset position if not a TAG directive
1432            self.position = saved_position;
1433            // Properly reset current_char based on saved position
1434            self.current_char = self
1435                .char_indices
1436                .iter()
1437                .find(|(i, _)| *i == saved_position.index)
1438                .map(|(_, ch)| *ch);
1439            // Reset the current_char_index
1440            self.current_char_index = self
1441                .char_indices
1442                .iter()
1443                .position(|(i, _)| *i == saved_position.index)
1444                .unwrap_or(0);
1445            Ok(None)
1446        }
1447    }
1448
1449    /// Scan a tag handle for TAG directive
1450    fn scan_tag_handle(&mut self) -> Result<String> {
1451        let mut handle = String::new();
1452
1453        if self.current_char != Some('!') {
1454            return Err(Error::scan(
1455                self.position,
1456                "Expected '!' at start of tag handle".to_string(),
1457            ));
1458        }
1459
1460        handle.push('!');
1461        self.advance();
1462
1463        // Handle can be "!" or "!!" or "!name!"
1464        if self.current_char == Some('!') {
1465            // Secondary handle "!!"
1466            handle.push('!');
1467            self.advance();
1468        } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1469            // Named handle like "!name!"
1470            while let Some(ch) = self.current_char {
1471                if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1472                    handle.push(ch);
1473                    self.advance();
1474                } else if ch == '!' {
1475                    handle.push(ch);
1476                    self.advance();
1477                    break;
1478                } else {
1479                    break;
1480                }
1481            }
1482        }
1483        // else just "!" primary handle
1484
1485        Ok(handle)
1486    }
1487
1488    /// Scan a tag prefix (URI) for TAG directive
1489    fn scan_tag_prefix(&mut self) -> Result<String> {
1490        let mut prefix = String::new();
1491
1492        // Read until end of line or comment
1493        while let Some(ch) = self.current_char {
1494            if ch == '\n' || ch == '\r' || ch == '#' {
1495                break;
1496            }
1497            if ch.is_whitespace() && prefix.is_empty() {
1498                self.advance();
1499                continue;
1500            }
1501            if ch.is_whitespace() && !prefix.is_empty() {
1502                // Trailing whitespace, we're done
1503                break;
1504            }
1505            prefix.push(ch);
1506            self.advance();
1507        }
1508
1509        if prefix.is_empty() {
1510            return Err(Error::scan(
1511                self.position,
1512                "Expected tag prefix after tag handle".to_string(),
1513            ));
1514        }
1515
1516        Ok(prefix.trim().to_string())
1517    }
1518
1519    /// Check if current position might be a directive
1520    fn is_directive(&self) -> bool {
1521        self.current_char == Some('%') && self.position.column == 1
1522    }
1523
1524    /// Scan document end marker (...)
1525    fn scan_document_end(&mut self) -> Result<Option<Token>> {
1526        if self.current_char == Some('.')
1527            && self.peek_char(1) == Some('.')
1528            && self.peek_char(2) == Some('.')
1529            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1530        {
1531            // Doc markers are invalid inside flow collections.
1532            if self.flow_level > 0 {
1533                return Err(Error::scan(
1534                    self.position,
1535                    "`...` document-end marker is not allowed inside a flow collection".to_string(),
1536                ));
1537            }
1538            let start_pos = self.position;
1539            self.advance(); // .
1540            self.advance(); // .
1541            self.advance(); // .
1542
1543            // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1544            // end-of-line (comments allowed). Inline content after `...`
1545            // is invalid (yaml-test-suite 3HFZ).
1546            while let Some(ch) = self.current_char {
1547                match ch {
1548                    ' ' | '\t' => {
1549                        self.advance();
1550                    }
1551                    '\n' | '\r' | '#' => break,
1552                    _ => {
1553                        return Err(Error::scan(
1554                            self.position,
1555                            "Content after `...` document-end marker is invalid".to_string(),
1556                        ));
1557                    }
1558                }
1559            }
1560
1561            Ok(Some(Token::new(
1562                TokenType::DocumentEnd,
1563                start_pos,
1564                self.position,
1565            )))
1566        } else {
1567            Ok(None)
1568        }
1569    }
1570
1571    /// Scan a comment token
1572    fn scan_comment(&mut self) -> Result<Token> {
1573        let start_pos = self.position;
1574        let mut comment_text = String::new();
1575
1576        // Skip the '#' character
1577        if self.current_char == Some('#') {
1578            self.advance();
1579        }
1580
1581        // Collect the comment text
1582        while let Some(ch) = self.current_char {
1583            if ch == '\n' || ch == '\r' {
1584                break;
1585            }
1586            comment_text.push(ch);
1587            self.advance();
1588        }
1589
1590        // Trim leading whitespace from comment text
1591        let comment_text = comment_text.trim_start().to_string();
1592
1593        Ok(Token::new(
1594            TokenType::Comment(comment_text),
1595            start_pos,
1596            self.position,
1597        ))
1598    }
1599
1600    /// Process a line and generate appropriate tokens
1601    #[allow(clippy::cognitive_complexity)]
1602    fn process_line(&mut self) -> Result<()> {
1603        // Check for directives at start of line
1604        if self.position.column == 1 && self.current_char == Some('%') {
1605            // Try to scan YAML directive
1606            if let Some(token) = self.scan_yaml_directive()? {
1607                self.tokens.push(token);
1608                return Ok(());
1609            }
1610
1611            // Try to scan TAG directive
1612            if let Some(token) = self.scan_tag_directive()? {
1613                self.tokens.push(token);
1614                return Ok(());
1615            }
1616
1617            // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1618            // does not recognize. Skip the line silently — parsing continues
1619            // with whatever follows on the next line.
1620            if self.current_char == Some('%') {
1621                while let Some(ch) = self.current_char {
1622                    if ch == '\n' || ch == '\r' {
1623                        break;
1624                    }
1625                    self.advance();
1626                }
1627                return Ok(());
1628            }
1629        }
1630
1631        // Check for document markers at start of line
1632        if self.position.column == 1 {
1633            // Check for document start marker
1634            if let Some(token) = self.scan_document_start()? {
1635                self.tokens.push(token);
1636                return Ok(());
1637            }
1638
1639            // Check for document end marker
1640            if let Some(token) = self.scan_document_end()? {
1641                self.tokens.push(token);
1642                return Ok(());
1643            }
1644        }
1645
1646        // Handle indentation at start of line
1647        if self.position.column == 1 {
1648            self.handle_indentation()?;
1649        }
1650
1651        // Skip empty lines and comments
1652        self.skip_whitespace();
1653
1654        match self.current_char {
1655            None => return Ok(()),
1656            Some('#') => {
1657                if self.preserve_comments {
1658                    // Create a comment token
1659                    let comment_token = self.scan_comment()?;
1660                    self.tokens.push(comment_token);
1661                } else {
1662                    // Skip comment lines
1663                    while let Some(ch) = self.current_char {
1664                        if ch == '\n' || ch == '\r' {
1665                            break;
1666                        }
1667                        self.advance();
1668                    }
1669                }
1670                return Ok(());
1671            }
1672            Some('\n' | '\r') => {
1673                self.advance();
1674                return Ok(());
1675            }
1676            _ => {}
1677        }
1678
1679        // Process tokens on this line
1680        while let Some(ch) = self.current_char {
1681            match ch {
1682                '\n' | '\r' => break,
1683                ' ' | '\t' => {
1684                    self.skip_whitespace();
1685                }
1686                '#' => {
1687                    // YAML 1.2 §6.6: a comment must be preceded by whitespace
1688                    // OR be at the start of a line. Inputs like `,#invalid`
1689                    // (yaml-test-suite CVW2) are not valid comments.
1690                    let prev = self.peek_char(-1);
1691                    let at_line_start = self.position.column == 1;
1692                    let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1693                    if !at_line_start && !preceded_by_space {
1694                        return Err(Error::scan(
1695                            self.position,
1696                            "Comment `#` must be preceded by whitespace".to_string(),
1697                        ));
1698                    }
1699                    if self.preserve_comments {
1700                        let comment_token = self.scan_comment()?;
1701                        self.tokens.push(comment_token);
1702                    } else {
1703                        while let Some(ch) = self.current_char {
1704                            if ch == '\n' || ch == '\r' {
1705                                break;
1706                            }
1707                            self.advance();
1708                        }
1709                    }
1710                    break;
1711                }
1712
1713                // Flow indicators. §7.4 allows a flow collection as
1714                // the implicit key of a block mapping (`[a]: b`,
1715                // `{x: y}: z`). When the flow-open is at line-start
1716                // (block context) and a `:` follows on the same line,
1717                // open the wrapping block mapping at the column of the
1718                // flow-open token, just as we do for line-start
1719                // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1720                '[' => {
1721                    if self.flow_level == 0
1722                        && self.position.column == self.current_indent + 1
1723                        && self.check_for_mapping_ahead()
1724                    {
1725                        self.maybe_open_block_mapping_for_key()?;
1726                    }
1727                    let pos = self.position;
1728                    self.advance();
1729                    self.flow_level += 1;
1730                    // Check depth limit
1731                    self.resource_tracker
1732                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1733                    self.tokens
1734                        .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1735                }
1736                ']' => {
1737                    // YAML 1.2 §7.4: `]` is only valid inside an open
1738                    // flow sequence. Stray `]` is a syntax error
1739                    // (yaml-test-suite 4H7K).
1740                    if self.flow_level == 0 {
1741                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1742                            .with_suggestion(
1743                                "Remove the extra `]` or open a flow sequence with `[` first"
1744                                    .to_string(),
1745                            );
1746                        return Err(Error::scan_with_context(
1747                            self.position,
1748                            "Unexpected `]` outside flow context",
1749                            context,
1750                        ));
1751                    }
1752                    let pos = self.position;
1753                    self.advance();
1754                    self.flow_level -= 1;
1755                    self.tokens
1756                        .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1757                }
1758                '{' => {
1759                    if self.flow_level == 0
1760                        && self.position.column == self.current_indent + 1
1761                        && self.check_for_mapping_ahead()
1762                    {
1763                        self.maybe_open_block_mapping_for_key()?;
1764                    }
1765                    let pos = self.position;
1766                    self.advance();
1767                    self.flow_level += 1;
1768                    // Check depth limit
1769                    self.resource_tracker
1770                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1771                    self.tokens
1772                        .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1773                }
1774                '}' => {
1775                    if self.flow_level == 0 {
1776                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1777                            .with_suggestion(
1778                                "Remove the extra `}` or open a flow mapping with `{` first"
1779                                    .to_string(),
1780                            );
1781                        return Err(Error::scan_with_context(
1782                            self.position,
1783                            "Unexpected `}` outside flow context",
1784                            context,
1785                        ));
1786                    }
1787                    let pos = self.position;
1788                    self.advance();
1789                    self.flow_level -= 1;
1790                    self.tokens
1791                        .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1792                }
1793                ',' => {
1794                    // §7.4: \`,\` is a flow indicator. Outside flow
1795                    // context it's not meaningful as a structural
1796                    // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1797                    // — the comma after a tag in block context is
1798                    // invalid).
1799                    if self.flow_level == 0 {
1800                        return Err(Error::scan(
1801                            self.position,
1802                            "Unexpected `,` outside flow context".to_string(),
1803                        ));
1804                    }
1805                    let pos = self.position;
1806                    self.advance();
1807                    self.tokens
1808                        .push(Token::new(TokenType::FlowEntry, pos, self.position));
1809                }
1810
1811                // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1812                //   * Block context: `:` separates key from value only when
1813                //     followed by whitespace / EOF — otherwise it's part of
1814                //     a plain scalar (e.g. `:foo`, `URL://path`).
1815                //   * Flow context: same, plus `:` may be adjacent to a
1816                //     value when the previous token completed a key node
1817                //     (quoted/plain scalar, alias, or closed flow
1818                //     collection) — see yaml-test-suite 5MUD, 5T43.
1819                ':' if self.peek_char(1).map_or(true, |c| {
1820                    c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1821                }) || (self.flow_level > 0
1822                    && matches!(
1823                        self.tokens.last().map(|t| &t.token_type),
1824                        Some(
1825                            TokenType::Scalar(_, _)
1826                                | TokenType::Alias(_)
1827                                | TokenType::FlowMappingEnd
1828                                | TokenType::FlowSequenceEnd
1829                        )
1830                    )) =>
1831                {
1832                    // §6.2: a \`:\` at line-start (the explicit-value
1833                    // counterpart of an explicit \`?\` key) must be
1834                    // followed by a SPACE — a tab as separator is
1835                    // invalid (yaml-test-suite Y79Y/007, /009).
1836                    if self.flow_level == 0
1837                        && self.position.column == self.current_indent + 1
1838                        && self.peek_char(1) == Some('\t')
1839                    {
1840                        return Err(Error::scan(
1841                            self.position,
1842                            "Tab cannot follow line-start `:` as explicit-value separator"
1843                                .to_string(),
1844                        ));
1845                    }
1846                    // §8.22: an implicit key in block context must fit
1847                    // on a single line. If the previous token is a
1848                    // flow-collection close whose matching open is on
1849                    // a different line, the flow node spans multiple
1850                    // lines and can't serve as the key (yaml-test-
1851                    // suite C2SP \`[23\\n]: 42\`).
1852                    if self.flow_level == 0 {
1853                        let mut is_flow_close = false;
1854                        let mut close_end_line = 0;
1855                        if let Some(last) = self.tokens.last() {
1856                            if matches!(
1857                                last.token_type,
1858                                TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1859                            ) {
1860                                is_flow_close = true;
1861                                close_end_line = last.end_position.line;
1862                            }
1863                        }
1864                        if is_flow_close {
1865                            let mut depth = 0i32;
1866                            let mut open_idx: Option<usize> = None;
1867                            for (idx, t) in self.tokens.iter().enumerate().rev() {
1868                                match &t.token_type {
1869                                    TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1870                                        depth += 1;
1871                                    }
1872                                    TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1873                                        depth -= 1;
1874                                        if depth == 0 {
1875                                            open_idx = Some(idx);
1876                                            break;
1877                                        }
1878                                    }
1879                                    _ => {}
1880                                }
1881                            }
1882                            if let Some(oi) = open_idx {
1883                                let open_line = self.tokens[oi].start_position.line;
1884                                // If a `?` (Key) token precedes the
1885                                // matching flow open on the same line
1886                                // as the key, the key is explicit and
1887                                // may span lines (yaml-test-suite M5DY
1888                                // \`? [ ...spans... ]: [ ... ]\`).
1889                                let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1890                                    matches!(t.token_type, TokenType::Key)
1891                                        && t.start_position.line == open_line
1892                                });
1893                                if !key_marker_before && open_line != close_end_line {
1894                                    return Err(Error::scan(
1895                                        self.position,
1896                                        "Implicit key in block context: flow collection key spans multiple lines"
1897                                            .to_string(),
1898                                    ));
1899                                }
1900                            }
1901                        }
1902                    }
1903                    let pos = self.position;
1904                    self.advance();
1905                    self.tokens
1906                        .push(Token::new(TokenType::Value, pos, self.position));
1907                }
1908
1909                // §6.2: the explicit-key marker \`?\` must be followed
1910                // by a SPACE (or EOL), not a tab. Tab as separator
1911                // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1912                '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1913                    return Err(Error::scan(
1914                        self.position,
1915                        "Tab cannot follow `?` as block-key separator".to_string(),
1916                    ));
1917                }
1918
1919                // Explicit key marker. An indented `?` at line-start
1920                // (e.g. `mapping:\\n  ? key`) opens an implicit block
1921                // mapping at this column — same as a line-start scalar
1922                // key. Without this, scan_plain_scalar wouldn't see
1923                // the inner mapping's indent and would wrongly fold
1924                // the key content into a multi-line scalar
1925                // (yaml-test-suite S9E8, KK5P).
1926                '?' if self.flow_level == 0
1927                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1928                        || self.peek_char(1).is_none()) =>
1929                {
1930                    if self.position.column == self.current_indent + 1 {
1931                        self.maybe_open_block_mapping_for_key()?;
1932                    }
1933                    let pos = self.position;
1934                    self.advance();
1935                    self.tokens
1936                        .push(Token::new(TokenType::Key, pos, self.position));
1937                }
1938                '?' if self.flow_level > 0
1939                    && (self
1940                        .peek_char(1)
1941                        .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1942                        || self.peek_char(1).is_none()) =>
1943                {
1944                    let pos = self.position;
1945                    self.advance();
1946                    self.tokens
1947                        .push(Token::new(TokenType::Key, pos, self.position));
1948                }
1949
1950                // Block entry
1951                '-' if self.flow_level == 0
1952                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1953                        || self.peek_char(1).is_none()) =>
1954                {
1955                    // A block-entry \`-\` immediately after a flow
1956                    // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1957                    // is invalid — no separator between the closed
1958                    // flow node and the next sibling (yaml-test-suite
1959                    // P2EQ \`- { y: z }- invalid\`). The same-line guard
1960                    // is essential — a \`}\` on a previous line with a
1961                    // new \`-\` on the next line is perfectly valid.
1962                    //
1963                    // Likewise, a block-entry \`-\` immediately after a
1964                    // property (Anchor / Tag) on the same line is
1965                    // invalid — the property must precede a node, and
1966                    // a block sequence's first \`-\` must begin a line
1967                    // (yaml-test-suite SY6V \`&anchor - x\`).
1968                    if let Some(last) = self.tokens.last() {
1969                        if matches!(
1970                            last.token_type,
1971                            TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1972                        ) && last.end_position.line == self.position.line
1973                        {
1974                            return Err(Error::scan(
1975                                self.position,
1976                                "Block-entry `-` immediately after flow collection close"
1977                                    .to_string(),
1978                            ));
1979                        }
1980                        if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1981                            && last.end_position.line == self.position.line
1982                        {
1983                            return Err(Error::scan(
1984                                self.position,
1985                                "Block-entry `-` cannot follow a property on the same line"
1986                                    .to_string(),
1987                            ));
1988                        }
1989                        // §8.22: a block sequence's first \`-\` must
1990                        // begin on a new line. \`key: - a\` (implicit
1991                        // key, then dash on same line) is invalid
1992                        // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1993                        // (explicit value-separator on the same line
1994                        // as the dash) IS valid: the \`?\` key sits
1995                        // on a previous line. We distinguish by
1996                        // walking back from the Value: if the
1997                        // preceding non-property token is a Scalar
1998                        // on the same line as the Value, the key
1999                        // is implicit; otherwise it's after \`?\`.
2000                        if matches!(last.token_type, TokenType::Value)
2001                            && last.end_position.line == self.position.line
2002                        {
2003                            let value_line = last.start_position.line;
2004                            let mut prior_scalar_line = None;
2005                            for t in self.tokens.iter().rev().skip(1) {
2006                                match &t.token_type {
2007                                    TokenType::Anchor(_) | TokenType::Tag(_) => {}
2008                                    TokenType::Scalar(..) => {
2009                                        prior_scalar_line = Some(t.end_position.line);
2010                                        break;
2011                                    }
2012                                    _ => break,
2013                                }
2014                            }
2015                            if prior_scalar_line == Some(value_line) {
2016                                return Err(Error::scan(
2017                                    self.position,
2018                                    "Block sequence value cannot start on the same line as its key"
2019                                        .to_string(),
2020                                ));
2021                            }
2022                        }
2023                    }
2024                    let pos = self.position;
2025                    self.advance();
2026
2027                    // Check if we need to start a new block sequence.
2028                    // `unwrap_or(0)` mirrors the pattern in
2029                    // src/scanner/indentation.rs and is safer than
2030                    // `.unwrap()` here: an error-recovery pop in another
2031                    // path could otherwise leave the stack empty and
2032                    // panic on crafted input (#18).
2033                    let last_indent = self.indent_stack.last().copied().unwrap_or(0);
2034
2035                    // If a compact sequence (opened from `? - x` or
2036                    // similar) is already active at this dash's column,
2037                    // the dash continues it — don't open a new nested
2038                    // block sequence (yaml-test-suite M5DY).
2039                    let dash_indent = pos.column.saturating_sub(1);
2040                    let compact_active_here = self
2041                        .compact_sequence_indents
2042                        .last()
2043                        .map_or(false, |&si| si == dash_indent);
2044                    if compact_active_here {
2045                        // Continuation of an existing compact sequence.
2046                    } else if self.current_indent > last_indent {
2047                        // Deeper indentation - start new nested sequence
2048                        self.indent_stack.push(self.current_indent);
2049                        self.indent_is_sequence.push(true);
2050                        // Check depth limit
2051                        self.resource_tracker
2052                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2053                        self.tokens
2054                            .push(Token::simple(TokenType::BlockSequenceStart, pos));
2055                    } else if self.current_indent == last_indent
2056                        && *self.indent_is_sequence.last().unwrap_or(&false)
2057                    {
2058                        // Same indent and the top of stack is already a sequence
2059                        // → continuation of that sequence; no new start needed.
2060                    } else if self.current_indent >= last_indent {
2061                        // Same or root level — compact notation.
2062                        // Start a new sequence only if we don't already have one
2063                        // tracked at this exact indent.
2064                        // For a dash that's *not* at line-start (e.g.
2065                        // `? - x` where current_indent is still the
2066                        // line's indent but the dash sits in mid-line),
2067                        // use the dash column - 1 as the sequence's
2068                        // indent so scan_plain_scalar's continuation
2069                        // check correctly sees the deeper context
2070                        // (yaml-test-suite M5DY).
2071                        let dash_indent = pos.column.saturating_sub(1);
2072                        let seq_indent = dash_indent.max(self.current_indent);
2073                        let has_active_compact = self
2074                            .compact_sequence_indents
2075                            .last()
2076                            .map_or(false, |&si| si == seq_indent);
2077
2078                        if !has_active_compact {
2079                            self.compact_sequence_indents.push(seq_indent);
2080                            // Check depth limit
2081                            self.resource_tracker.check_depth(
2082                                &self.limits,
2083                                self.flow_level + self.indent_stack.len(),
2084                            )?;
2085                            self.tokens
2086                                .push(Token::simple(TokenType::BlockSequenceStart, pos));
2087                        }
2088                    }
2089
2090                    self.tokens
2091                        .push(Token::new(TokenType::BlockEntry, pos, self.position));
2092
2093                    // After emitting BlockEntry, check if the next
2094                    // token is another dash (nested sequence). §6.2
2095                    // requires SPACE separation between dashes — a
2096                    // tab between the outer and inner \`-\` is invalid
2097                    // (yaml-test-suite Y79Y/004, /005). Track whether
2098                    // a tab was consumed while skipping the inter-
2099                    // dash whitespace and reject if so.
2100                    let mut saw_tab_between = false;
2101                    while let Some(c) = self.current_char {
2102                        if c == ' ' {
2103                            self.advance();
2104                        } else if c == '\t' {
2105                            saw_tab_between = true;
2106                            self.advance();
2107                        } else {
2108                            break;
2109                        }
2110                    }
2111                    if self.current_char == Some('-')
2112                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2113                        && saw_tab_between
2114                    {
2115                        return Err(Error::scan(
2116                            self.position,
2117                            "Tab between block-entries on same line".to_string(),
2118                        ));
2119                    }
2120                    if self.current_char == Some('-')
2121                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2122                    {
2123                        // We have a nested sequence on the same line!
2124                        // Track this as an inline sequence
2125                        self.inline_sequence_depth += 1;
2126                        // Push the *indent* (column - 1), not the
2127                        // column, so it matches the convention used by
2128                        // maybe_open_block_mapping_for_key. With column
2129                        // here the next-line indent (column - 1) would
2130                        // be strictly less than the stored value and
2131                        // wrongly trigger an early close, breaking
2132                        // multi-line nested sequences (yaml-test-suite
2133                        // 3ALJ, 57H4).
2134                        self.indent_stack
2135                            .push(self.position.column.saturating_sub(1));
2136                        self.indent_is_sequence.push(true);
2137                        // Check depth limit
2138                        self.resource_tracker
2139                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2140                        self.tokens
2141                            .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2142                        // Continue processing - the next iteration will handle the nested dash
2143                    } else if self.current_char.is_some()
2144                        && !matches!(self.current_char, Some('\n' | '\r'))
2145                    {
2146                        // Content follows "- " on the same line.
2147                        // Update current_indent to the content's column position so that
2148                        // any mapping started here will be at a deeper indent level than
2149                        // the sequence. This ensures handle_indentation properly closes
2150                        // the mapping when the next sibling "- " appears.
2151                        self.current_indent = self.position.column - 1;
2152                    }
2153                }
2154
2155                // Quoted strings — same implicit-key mapping detection
2156                // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2157                '"' | '\'' => {
2158                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2159                        self.maybe_open_block_mapping_for_key()?;
2160                    }
2161                    let token = self.scan_quoted_string(ch)?;
2162                    self.tokens.push(token);
2163                }
2164
2165                // Document markers (only if not a block entry).
2166                //
2167                // Reached only when `-` is at column = current_indent + 1 AND
2168                // the next character is non-whitespace — i.e. either the
2169                // `---` document-start marker OR a plain scalar starting
2170                // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2171                // declines, we MUST consume the run as a plain scalar — not
2172                // consulting `is_plain_scalar_start` here, because that helper
2173                // unconditionally rejects `-`, which would leave the outer
2174                // `while let` loop spinning on the same character.
2175                '-' if self.position.column == self.current_indent + 1
2176                    && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2177                {
2178                    if let Some(token) = self.scan_document_start()? {
2179                        self.tokens.push(token);
2180                    } else {
2181                        let token = self.scan_plain_scalar()?;
2182                        self.tokens.push(token);
2183                    }
2184                }
2185                '.' if self.position.column == self.current_indent + 1 => {
2186                    if let Some(token) = self.scan_document_end()? {
2187                        self.tokens.push(token);
2188                    } else if self.is_plain_scalar_start() {
2189                        let token = self.scan_plain_scalar()?;
2190                        self.tokens.push(token);
2191                    }
2192                }
2193
2194                // Numbers or plain scalars starting with -
2195                // Only scan as number if the entire token is numeric (no trailing letters)
2196                _ if (ch.is_ascii_digit()
2197                    || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2198                    && self.is_pure_number() =>
2199                {
2200                    let token = self.scan_number()?;
2201                    self.tokens.push(token);
2202                }
2203
2204                // Anchors and aliases. §6.9: a node's properties
2205                // (anchor/tag) are prefixes of the node. When an `&`,
2206                // `*`, or `!` is at the start of a line (column ==
2207                // current_indent + 1) and a `: ` follows on the same
2208                // line, the property/alias is part of an implicit
2209                // key's leading position. The block mapping that
2210                // contains this key therefore opens at this column,
2211                // *before* the property/alias token is emitted
2212                // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2213                '&' => {
2214                    // Mirror H7J7 check for anchors (yaml-test-suite
2215                    // G9HC \`seq:\\n&anchor\\n- a\`).
2216                    if self.flow_level == 0
2217                        && self.position.column == self.current_indent + 1
2218                        && !self.check_for_mapping_ahead()
2219                        && self.indent_stack.len() > 1
2220                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2221                        && self.most_recent_token_is_value_separator()
2222                    {
2223                        return Err(Error::scan(
2224                            self.position,
2225                            "Anchor at line-start with insufficient indent for value position"
2226                                .to_string(),
2227                        ));
2228                    }
2229                    if self.flow_level == 0
2230                        && self.position.column == self.current_indent + 1
2231                        && self.check_for_mapping_ahead()
2232                    {
2233                        self.maybe_open_block_mapping_for_key()?;
2234                    }
2235                    let token = self.scan_anchor()?;
2236                    self.tokens.push(token);
2237                }
2238                '*' => {
2239                    // §6.9.2: alias/anchor names may contain \`:\` (only
2240                    // flow indicators and whitespace terminate them).
2241                    // So \`*a:\` is an alias named \`a:\`, NOT an alias
2242                    // \`*a\` followed by a key separator. Don't open
2243                    // an implicit block mapping in that case (yaml-
2244                    // test-suite 2SXE).
2245                    if self.flow_level == 0
2246                        && self.position.column == self.current_indent + 1
2247                        && self.check_for_mapping_ahead()
2248                        && !self.colon_belongs_to_alias_anchor_name()
2249                    {
2250                        self.maybe_open_block_mapping_for_key()?;
2251                    }
2252                    let token = self.scan_alias()?;
2253                    self.tokens.push(token);
2254                }
2255
2256                // Block scalars
2257                '|' => {
2258                    let token = self.scan_literal_block_scalar()?;
2259                    self.tokens.push(token);
2260                    // Block scalar collection rewinds the cursor to the
2261                    // start of the next under-indented line. `current_indent`
2262                    // is still set to the inline content's column from the
2263                    // enclosing `- |` / `key: |` site, so the next iteration
2264                    // would mis-dispatch. Break out so the outer loop
2265                    // re-enters `process_line` and reruns indent handling
2266                    // (yaml-test-suite 4QFQ, M6YH, P2AD).
2267                    break;
2268                }
2269                '>' => {
2270                    let token = self.scan_folded_block_scalar()?;
2271                    self.tokens.push(token);
2272                    break;
2273                }
2274
2275                // Tags. Same line-start property-opens-mapping rule
2276                // (yaml-test-suite ZH7C variants).
2277                //
2278                // §6.9: a property at the SAME indent as the
2279                // enclosing mapping/sequence cannot apply to that
2280                // collection's value — the value must be more
2281                // indented. If we're at a line-start \`!\` whose column
2282                // equals the enclosing mapping's indent + 1 AND that
2283                // mapping currently has a key awaiting a value, the
2284                // tag is misplaced (yaml-test-suite H7J7).
2285                '!' => {
2286                    if self.flow_level == 0
2287                        && self.position.column == self.current_indent + 1
2288                        && !self.check_for_mapping_ahead()
2289                        && self.indent_stack.len() > 1
2290                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2291                        && self.most_recent_token_is_value_separator()
2292                    {
2293                        return Err(Error::scan(
2294                            self.position,
2295                            "Tag at line-start with insufficient indent for value position"
2296                                .to_string(),
2297                        ));
2298                    }
2299                    if self.flow_level == 0
2300                        && self.position.column == self.current_indent + 1
2301                        && self.check_for_mapping_ahead()
2302                    {
2303                        self.maybe_open_block_mapping_for_key()?;
2304                    }
2305                    let token = self.scan_tag()?;
2306                    self.tokens.push(token);
2307                }
2308
2309                // Plain scalars
2310                _ if self.is_plain_scalar_start() => {
2311                    // A plain scalar starting on the SAME line as a
2312                    // flow-collection close (\`}\` or \`]\`) means there's
2313                    // no separator between the closed flow node and
2314                    // the new content (yaml-test-suite 62EZ
2315                    // \`x: { y: z }in: valid\`).
2316                    if self.flow_level == 0 {
2317                        if let Some(last) = self.tokens.last() {
2318                            if matches!(
2319                                last.token_type,
2320                                TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2321                            ) && last.end_position.line == self.position.line
2322                            {
2323                                return Err(Error::scan(
2324                                    self.position,
2325                                    "Plain scalar immediately after flow collection close"
2326                                        .to_string(),
2327                                ));
2328                            }
2329                        }
2330                    }
2331                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2332                        self.maybe_open_block_mapping_for_key()?;
2333                    }
2334
2335                    let token = self.scan_plain_scalar()?;
2336                    self.tokens.push(token);
2337                }
2338
2339                _ => {
2340                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2341                        .with_suggestion("Check for valid YAML syntax characters".to_string());
2342                    return Err(Error::invalid_character_with_context(
2343                        self.position,
2344                        ch,
2345                        "YAML document",
2346                        context,
2347                    ));
2348                }
2349            }
2350        }
2351
2352        // Inline sequences (nested \`- -\` on one line) used to be
2353        // closed unconditionally at end-of-line. But a nested sequence
2354        // can span lines (`- - a\n  - b\n- c`) — in that case the inner
2355        // sequence must remain open until handle_indentation sees a
2356        // dedent. Reset the inline-sequence counter (so the next line
2357        // is judged on its own merits) but DO NOT emit BlockEnd —
2358        // handle_indentation's indent_stack pop, the end-of-stream
2359        // close at scan_next_token, and the explicit-dedent close at
2360        // handle_indentation's bottom each provide a correct close.
2361        self.inline_sequence_depth = 0;
2362
2363        Ok(())
2364    }
2365
2366    /// Scan the next token lazily
2367    fn scan_next_token(&mut self) -> Result<()> {
2368        if self.done {
2369            return Ok(());
2370        }
2371
2372        // Add stream start token if this is the beginning
2373        if self.tokens.is_empty() {
2374            self.tokens
2375                .push(Token::simple(TokenType::StreamStart, self.position));
2376            return Ok(());
2377        }
2378
2379        // Check if we're at the end of input
2380        if self.current_char.is_none() {
2381            if !self
2382                .tokens
2383                .iter()
2384                .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2385            {
2386                self.tokens
2387                    .push(Token::simple(TokenType::StreamEnd, self.position));
2388            }
2389            self.done = true;
2390            return Ok(());
2391        }
2392
2393        // For now, fall back to scanning all tokens at once for the lazy scanner
2394        // This is a simplified implementation - a full streaming parser would
2395        // need more sophisticated state management
2396        let tokens_before = self.tokens.len();
2397        self.scan_all_tokens()?;
2398
2399        // Mark as done after scanning all tokens
2400        if self.tokens.len() == tokens_before {
2401            self.done = true;
2402        }
2403
2404        Ok(())
2405    }
2406
2407    /// Pre-scan all tokens (simplified approach for basic implementation)
2408    fn scan_all_tokens(&mut self) -> Result<()> {
2409        // Only add StreamStart if we don't have it yet
2410        if !self
2411            .tokens
2412            .iter()
2413            .any(|t| matches!(t.token_type, TokenType::StreamStart))
2414        {
2415            self.tokens
2416                .push(Token::simple(TokenType::StreamStart, self.position));
2417        }
2418
2419        while self.current_char.is_some() {
2420            self.process_line()?;
2421
2422            // Advance past newlines
2423            while let Some(ch) = self.current_char {
2424                if ch == '\n' || ch == '\r' {
2425                    self.advance();
2426                } else {
2427                    break;
2428                }
2429            }
2430        }
2431
2432        // Close any remaining compact sequences (before their parent mappings)
2433        while self.compact_sequence_indents.pop().is_some() {
2434            self.tokens
2435                .push(Token::simple(TokenType::BlockEnd, self.position));
2436        }
2437
2438        // Close any remaining blocks
2439        while self.indent_stack.len() > 1 {
2440            self.indent_stack.pop();
2441            self.indent_is_sequence.pop();
2442            self.tokens
2443                .push(Token::simple(TokenType::BlockEnd, self.position));
2444        }
2445
2446        self.tokens
2447            .push(Token::simple(TokenType::StreamEnd, self.position));
2448        self.done = true;
2449        Ok(())
2450    }
2451
2452    /// Peek at a character at the given offset (can be negative)
2453    /// Check if the current position starts a pure number (digits/dots/minus only,
2454    /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2455    fn is_pure_number(&self) -> bool {
2456        let mut offset: isize = 0;
2457        let first = self.peek_char(0);
2458        // Skip leading minus
2459        if first == Some('-') {
2460            offset = 1;
2461        }
2462        // Scan digits and at most one dot
2463        let mut has_digit = false;
2464        let mut dot_count = 0;
2465        loop {
2466            match self.peek_char(offset) {
2467                Some(c) if c.is_ascii_digit() => {
2468                    has_digit = true;
2469                    offset += 1;
2470                }
2471                Some('.') => {
2472                    dot_count += 1;
2473                    if dot_count > 1 {
2474                        // Multiple dots (e.g. 0.5.8) — not a number
2475                        return false;
2476                    }
2477                    offset += 1;
2478                }
2479                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2480                    // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2481                    return false;
2482                }
2483                Some(c) => {
2484                    // For a token to be a pure number, what follows
2485                    // the digits must be end-of-token. In flow
2486                    // context that's a flow indicator. In block
2487                    // context the rest of the line must be pure
2488                    // whitespace (possibly trailing a comment) — if
2489                    // there's more non-whitespace content on this
2490                    // line, the digits are part of a larger plain
2491                    // scalar like \`1 - 3\` (yaml-test-suite P76L)
2492                    // or \`20:03:20\` (yaml-test-suite U9NS).
2493                    if self.flow_level > 0 && ",[]{}".contains(c) {
2494                        return has_digit;
2495                    }
2496                    if c == '\n' || c == '\r' {
2497                        return has_digit;
2498                    }
2499                    if c == ' ' || c == '\t' {
2500                        // Look ahead: rest of line must be whitespace
2501                        // or a comment.
2502                        let mut probe = offset + 1;
2503                        loop {
2504                            match self.peek_char(probe) {
2505                                None => return has_digit,
2506                                Some('\n' | '\r') => return has_digit,
2507                                Some('#') => return has_digit,
2508                                Some(' ' | '\t') => probe += 1,
2509                                Some(_) => return false,
2510                            }
2511                        }
2512                    }
2513                    if c == ':' {
2514                        let next = self.peek_char(offset + 1);
2515                        return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2516                    }
2517                    return false;
2518                }
2519                None => return has_digit,
2520            }
2521        }
2522    }
2523
2524    fn peek_char(&self, offset: isize) -> Option<char> {
2525        if offset >= 0 {
2526            let target_index = self.current_char_index + offset as usize;
2527            if target_index < self.char_cache.len() {
2528                Some(self.char_cache[target_index])
2529            } else {
2530                None
2531            }
2532        } else {
2533            let offset_magnitude = (-offset) as usize;
2534            if self.current_char_index >= offset_magnitude {
2535                Some(self.char_cache[self.current_char_index - offset_magnitude])
2536            } else {
2537                None
2538            }
2539        }
2540    }
2541
2542    /// Scan an anchor token (&name)
2543    fn scan_anchor(&mut self) -> Result<Token> {
2544        let start_pos = self.position;
2545        self.advance(); // Skip '&'
2546
2547        let name = self.scan_identifier()?;
2548        if name.is_empty() {
2549            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2550                "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2551            );
2552            return Err(Error::scan_with_context(
2553                self.position,
2554                "Anchor name cannot be empty",
2555                context,
2556            ));
2557        }
2558
2559        // Track anchor for resource limits
2560        self.resource_tracker.add_anchor(&self.limits)?;
2561
2562        Ok(Token::new(
2563            TokenType::Anchor(name),
2564            start_pos,
2565            self.position,
2566        ))
2567    }
2568
2569    /// Scan an alias token (*name)
2570    fn scan_alias(&mut self) -> Result<Token> {
2571        let start_pos = self.position;
2572        self.advance(); // Skip '*'
2573
2574        let name = self.scan_identifier()?;
2575        if name.is_empty() {
2576            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2577                "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2578            );
2579            return Err(Error::scan_with_context(
2580                self.position,
2581                "Alias name cannot be empty",
2582                context,
2583            ));
2584        }
2585
2586        Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2587    }
2588
2589    /// Scan an identifier (used for anchor and alias names)
2590    fn scan_identifier(&mut self) -> Result<String> {
2591        // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2592        // exclusions are whitespace and the flow indicators `,[]{}`. This
2593        // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2594        // codepoints (including emoji), matching the spec exactly.
2595        let mut identifier = String::new();
2596        while let Some(ch) = self.current_char {
2597            if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2598                break;
2599            }
2600            identifier.push(ch);
2601            self.advance();
2602        }
2603        Ok(identifier)
2604    }
2605
2606    /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2607    fn scan_tag(&mut self) -> Result<Token> {
2608        let start_pos = self.position;
2609        self.advance(); // Skip first '!'
2610
2611        let mut tag = String::from("!");
2612
2613        // Check for verbatim tag format: !<tag>
2614        if self.current_char == Some('<') {
2615            tag.push('<');
2616            self.advance(); // Skip '<'
2617
2618            // Scan until closing '>'
2619            while let Some(ch) = self.current_char {
2620                if ch == '>' {
2621                    tag.push(ch);
2622                    self.advance();
2623                    break;
2624                } else if ch.is_control() || ch.is_whitespace() {
2625                    return Err(Error::scan(
2626                        self.position,
2627                        "Invalid character in verbatim tag".to_string(),
2628                    ));
2629                }
2630                tag.push(ch);
2631                self.advance();
2632            }
2633        } else {
2634            // Check for secondary tag handle: !!
2635            if self.current_char == Some('!') {
2636                tag.push('!');
2637                self.advance(); // Skip second '!'
2638            }
2639
2640            // Scan tag name/suffix.
2641            //
2642            // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2643            // contain any URI character (RFC 3986 unreserved + sub-delims +
2644            // a few others) or `%XX` percent-encoded bytes. The handful of
2645            // characters listed below covers the alphanumeric + URI-safe
2646            // punctuation set used by yaml-test-suite. Percent decoding of
2647            // `%XX` happens later in `TagResolver::resolve`.
2648            //
2649            // §5.3: inside a flow collection, the flow indicators
2650            // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2651            // must NOT consume them into the tag suffix even though
2652            // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2653            // YAML 1.2 in practice treats `,` as a flow indicator that
2654            // must be percent-encoded (\`%2C\`) when it appears inside
2655            // a tag suffix — bare \`,\` is not allowed in EITHER block
2656            // or flow context (yaml-test-suite U99R).
2657            while let Some(ch) = self.current_char {
2658                if matches!(ch, ',') {
2659                    break;
2660                }
2661                if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2662                    break;
2663                }
2664                // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2665                // `tag:yaml.org,2002:str` legitimately contains two
2666                // colons inside its URI. But a `:` followed by
2667                // whitespace, EOL or EOF is the YAML mapping-value
2668                // indicator and MUST terminate the tag, otherwise
2669                // `!handle!suffix: value` is mis-scanned as
2670                // `Tag("!handle!suffix:") Scalar("value")` and the
2671                // implicit-key mapping structure is lost. Mirrors the
2672                // `,` carve-out above (a valid URI char that's also a
2673                // YAML flow indicator in some contexts).
2674                if ch == ':' {
2675                    match self.peek_char(1) {
2676                        None => break,
2677                        Some(c) if c.is_whitespace() => break,
2678                        _ => {}
2679                    }
2680                }
2681                if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2682                    tag.push(ch);
2683                    self.advance();
2684                } else {
2685                    break;
2686                }
2687            }
2688        }
2689
2690        Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2691    }
2692
2693    /// Scan a literal block scalar (|)
2694    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2695        let start_pos = self.position;
2696        self.advance(); // Skip '|'
2697
2698        // Parse block scalar header (indicators like +, -, explicit indent)
2699        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2700
2701        // Skip to next line
2702        self.skip_to_next_line()?;
2703
2704        // Determine indentation. `base_indent` is the surrounding
2705        // block's indent — i.e. the indent of the sequence or
2706        // mapping that contains this scalar. `self.current_indent`
2707        // is sometimes set to the inline indicator column (e.g. 2
2708        // for `- |`), which would make `base_indent + explicit`
2709        // wrong; use the top of `indent_stack` instead
2710        // (yaml-test-suite 4QFQ `|1`).
2711        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2712        let content_indent = if let Some(explicit) = explicit_indent {
2713            base_indent + explicit
2714        } else {
2715            // Find the first non-empty content line to determine indentation
2716            self.find_block_scalar_indent(base_indent)?
2717        };
2718
2719        // Collect the literal block content
2720        let content = self.collect_literal_block_content(content_indent, chomping)?;
2721
2722        Ok(Token::new(
2723            TokenType::BlockScalarLiteral(content),
2724            start_pos,
2725            self.position,
2726        ))
2727    }
2728
2729    /// Scan a folded block scalar (>)
2730    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2731        let start_pos = self.position;
2732        self.advance(); // Skip '>'
2733
2734        // Parse block scalar header (indicators like +, -, explicit indent)
2735        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2736
2737        // Skip to next line
2738        self.skip_to_next_line()?;
2739
2740        // See scan_literal_block_scalar for why we read `indent_stack`
2741        // rather than `current_indent`.
2742        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2743        let content_indent = if let Some(explicit) = explicit_indent {
2744            base_indent + explicit
2745        } else {
2746            // Find the first non-empty content line to determine indentation
2747            self.find_block_scalar_indent(base_indent)?
2748        };
2749
2750        // Collect the folded block content
2751        let content = self.collect_folded_block_content(content_indent, chomping)?;
2752
2753        Ok(Token::new(
2754            TokenType::BlockScalarFolded(content),
2755            start_pos,
2756            self.position,
2757        ))
2758    }
2759
2760    /// Parse block scalar header indicators (+, -, and explicit indent)
2761    fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2762        let mut chomping = ChompingMode::Clip;
2763        let mut explicit_indent: Option<usize> = None;
2764        // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2765        // \`>#x\` are invalid (yaml-test-suite X4QW).
2766        let mut seen_separator_ws = false;
2767
2768        // Parse indicators in any order
2769        while let Some(ch) = self.current_char {
2770            match ch {
2771                '+' => {
2772                    chomping = ChompingMode::Keep;
2773                    self.advance();
2774                }
2775                '-' => {
2776                    chomping = ChompingMode::Strip;
2777                    self.advance();
2778                }
2779                '0'..='9' => {
2780                    let digit = ch.to_digit(10).unwrap() as usize;
2781                    if explicit_indent.is_some() {
2782                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2783                            .with_suggestion(
2784                                "Use only one indent indicator digit in block scalar".to_string(),
2785                            );
2786                        return Err(Error::scan_with_context(
2787                            self.position,
2788                            "Multiple indent indicators in block scalar",
2789                            context,
2790                        ));
2791                    }
2792                    // YAML 1.2 §8.1.1.1: explicit indent indicator is
2793                    // 1..=9. `|0` and `>0` are invalid
2794                    // (yaml-test-suite 2G84/00).
2795                    if digit == 0 {
2796                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2797                            .with_suggestion(
2798                                "Block-scalar indent indicator must be 1-9".to_string(),
2799                            );
2800                        return Err(Error::scan_with_context(
2801                            self.position,
2802                            "Block-scalar indent indicator `0` is invalid",
2803                            context,
2804                        ));
2805                    }
2806                    explicit_indent = Some(digit);
2807                    self.advance();
2808                }
2809                ' ' | '\t' => {
2810                    seen_separator_ws = true;
2811                    self.advance(); // Skip whitespace
2812                }
2813                '#' => {
2814                    if !seen_separator_ws {
2815                        return Err(Error::scan(
2816                            self.position,
2817                            "Comment in block-scalar header must be preceded by whitespace"
2818                                .to_string(),
2819                        ));
2820                    }
2821                    // Skip comment to end of line
2822                    while let Some(ch) = self.current_char {
2823                        self.advance();
2824                        if ch == '\n' || ch == '\r' {
2825                            break;
2826                        }
2827                    }
2828                    break;
2829                }
2830                '\n' | '\r' => break,
2831                _ => {
2832                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2833                        .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2834                    return Err(Error::invalid_character_with_context(
2835                        self.position,
2836                        ch,
2837                        "block scalar header",
2838                        context,
2839                    ));
2840                }
2841            }
2842        }
2843
2844        Ok((chomping, explicit_indent))
2845    }
2846
2847    /// Advance the cursor PAST the next line break, but do not consume
2848    /// any leading whitespace on the line that follows. The block-
2849    /// scalar header parser uses this to step from the indicator line
2850    /// to the start of the content line — the next line's leading
2851    /// spaces are part of its content_indent, not header whitespace.
2852    fn skip_to_next_line(&mut self) -> Result<()> {
2853        // If we're already at column 1 (the comment handler in
2854        // scan_block_scalar_header may have already advanced past a
2855        // newline), do nothing — the next line's leading whitespace
2856        // belongs to its content_indent.
2857        if self.position.column == 1 {
2858            return Ok(());
2859        }
2860        while let Some(ch) = self.current_char {
2861            match ch {
2862                '\n' | '\r' => {
2863                    self.advance();
2864                    return Ok(());
2865                }
2866                ' ' | '\t' => {
2867                    self.advance();
2868                }
2869                _ => return Ok(()),
2870            }
2871        }
2872        Ok(())
2873    }
2874
    /// Determine the content indentation of a block scalar by look-ahead.
    ///
    /// Per spec §8.1.1.1, the indent is the leading-space count of the
    /// first non-empty content line, or the longest blank-line indent if
    /// no non-empty line exists before end of input. When the scanner is
    /// nested inside another block and the first non-empty line is not
    /// strictly deeper than `base_indent`, that line is a sibling
    /// structure rather than content (yaml-test-suite K858, P2AD); the
    /// result is then the larger of `base_indent + 1` and the longest
    /// leading blank-line indent.
    ///
    /// The cursor (`position`, `current_char`, `current_char_index`) is
    /// restored to its entry state before returning `Ok` and before the
    /// "blank-line indent exceeds content indent" error. The tab error is
    /// returned with the cursor still on the offending tab.
    ///
    /// # Errors
    ///
    /// - a scanned line starts with a tab and no leading spaces;
    /// - a leading blank line is indented deeper than the detected content
    ///   indent.
    fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
        // Snapshot the cursor so the look-ahead can be undone.
        let saved_position = self.position;
        let saved_char = self.current_char;
        let saved_char_index = self.current_char_index;

        // Deepest indent seen on a blank (whitespace-only) leading line.
        let mut max_blank_indent: usize = 0;
        let mut found = false;
        // Placeholder; always overwritten before being returned.
        let mut content_indent: usize = 1;

        loop {
            // Count this line's leading spaces.
            let mut line_indent = 0;
            while self.current_char == Some(' ') {
                line_indent += 1;
                self.advance();
            }
            // §6.1 + §8.1: tabs cannot serve as block-scalar
            // indentation. A line that BEGINS with a tab (no leading
            // spaces) inside the block scalar's indent search is
            // invalid (yaml-test-suite Y79Y/000 `foo: |\n\tbar`).
            // Tabs that appear AFTER one or more spaces are content,
            // not indentation, and remain valid (yaml-test-suite
            // 96NN/00 `foo: |-\n \tbar`).
            if line_indent == 0 && self.current_char == Some('\t') {
                return Err(Error::scan(
                    self.position,
                    "Tab cannot serve as block-scalar indentation".to_string(),
                ));
            }

            match self.current_char {
                None => {
                    // End of input on a whitespace-only line: it still
                    // counts as a blank line for indent purposes.
                    if line_indent > max_blank_indent {
                        max_blank_indent = line_indent;
                    }
                    break;
                }
                Some('\n' | '\r') => {
                    // Blank line: remember its indent and keep looking.
                    if line_indent > max_blank_indent {
                        max_blank_indent = line_indent;
                    }
                    self.advance();
                    // fall through to next iteration
                }
                Some(_) => {
                    // If we're nested inside another block — either
                    // via the `indent_stack` (normal mapping/sequence
                    // open) or `compact_sequence_indents` (a
                    // compact block sequence at the same indent as
                    // its parent) — and this candidate line is not
                    // strictly deeper than base_indent, it's a
                    // sibling outside the scalar's scope (yaml-test-
                    // suite K858, P2AD).
                    let inside_block =
                        self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
                    if inside_block && line_indent <= base_indent {
                        content_indent = max_blank_indent.max(base_indent + 1);
                    } else {
                        content_indent = line_indent;
                    }
                    // §8.1.2.1: leading blank lines may not exceed the
                    // detected content indent — that ambiguity is
                    // invalid (yaml-test-suite W9L4, S98Z).
                    if max_blank_indent > content_indent {
                        // Rewind first so the error is reported at the
                        // start of the scalar body.
                        self.position = saved_position;
                        self.current_char = saved_char;
                        self.current_char_index = saved_char_index;
                        return Err(Error::scan(
                            self.position,
                            "Block scalar leading blank-line indent exceeds content indent"
                                .to_string(),
                        ));
                    }
                    found = true;
                    break;
                }
            }
        }

        // Only blank lines up to end of input: use the deepest one.
        if !found {
            content_indent = max_blank_indent;
        }

        // Undo the look-ahead; the collectors re-read the body from here.
        self.position = saved_position;
        self.current_char = saved_char;
        self.current_char_index = saved_char_index;

        Ok(content_indent)
    }
2971
2972    /// Count indentation at start of current line
2973    fn count_line_indent(&mut self) -> usize {
2974        let mut indent = 0;
2975        let saved_position = self.position;
2976        let saved_char = self.current_char;
2977        let saved_char_index = self.current_char_index;
2978
2979        while let Some(ch) = self.current_char {
2980            if ch == ' ' {
2981                indent += 1;
2982                self.advance();
2983            } else if ch == '\t' {
2984                indent += 8; // Tab counts as 8 spaces
2985                self.advance();
2986            } else {
2987                break;
2988            }
2989        }
2990
2991        // Restore position
2992        self.position = saved_position;
2993        self.current_char = saved_char;
2994        self.current_char_index = saved_char_index;
2995
2996        indent
2997    }
2998
2999    /// Collect content for a literal block scalar.
3000    ///
3001    /// Each line is preserved with its terminating newline. After collection
3002    /// we apply the chomping mode per spec §8.1.1.2.
3003    fn collect_literal_block_content(
3004        &mut self,
3005        content_indent: usize,
3006        chomping: ChompingMode,
3007    ) -> Result<String> {
3008        let mut content = String::new();
3009
3010        loop {
3011            // Count current line's leading-space indent.
3012            let mut line_indent = 0;
3013            let save_pos = self.position;
3014            let save_ch = self.current_char;
3015            let save_idx = self.current_char_index;
3016            while self.current_char == Some(' ') {
3017                line_indent += 1;
3018                self.advance();
3019            }
3020
3021            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3022
3023            if !line_is_blank && line_indent < content_indent {
3024                // Non-empty line with less indent ends the scalar; rewind.
3025                self.position = save_pos;
3026                self.current_char = save_ch;
3027                self.current_char_index = save_idx;
3028                break;
3029            }
3030
3031            // Document marker at line start always ends the scalar,
3032            // regardless of content_indent (allows zero-indented
3033            // block scalars per yaml-test-suite FP8R).
3034            if line_indent == 0 && self.is_doc_marker_here() {
3035                self.position = save_pos;
3036                self.current_char = save_ch;
3037                self.current_char_index = save_idx;
3038                break;
3039            }
3040
3041            if line_is_blank {
3042                // A blank line counts when there's an actual line break
3043                // to consume. EOF after we've consumed some whitespace
3044                // on the trailing line ALSO counts as one final blank
3045                // line (yaml-test-suite JEF9/02: `- |+\n        `).
3046                if matches!(self.current_char, Some('\n' | '\r')) {
3047                    // Whitespace beyond content_indent is literal content
3048                    // even on blank lines (yaml-test-suite 6FWR).
3049                    for _ in content_indent..line_indent {
3050                        content.push(' ');
3051                    }
3052                    content.push('\n');
3053                    self.advance();
3054                    continue;
3055                }
3056                if line_indent > 0 {
3057                    for _ in content_indent..line_indent {
3058                        content.push(' ');
3059                    }
3060                    content.push('\n');
3061                }
3062                break;
3063            }
3064
3065            // Content line: we already consumed `line_indent` spaces, but
3066            // only `content_indent` of them belong to indentation. Any
3067            // extra leading spaces are literal content.
3068            let mut line = String::new();
3069            for _ in content_indent..line_indent {
3070                line.push(' ');
3071            }
3072            while let Some(ch) = self.current_char {
3073                if ch == '\n' || ch == '\r' {
3074                    self.advance();
3075                    break;
3076                }
3077                line.push(ch);
3078                self.advance();
3079            }
3080            content.push_str(&line);
3081            content.push('\n');
3082
3083            if self.current_char.is_none() {
3084                break;
3085            }
3086        }
3087
3088        Ok(apply_chomping(content, chomping))
3089    }
3090
3091    /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3092    fn is_doc_marker_here(&self) -> bool {
3093        let c0 = self.current_char;
3094        let c1 = self.peek_char(1);
3095        let c2 = self.peek_char(2);
3096        let c3 = self.peek_char(3);
3097        let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3098        (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3099            || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3100    }
3101
    /// Collect content for a folded (`>`) block scalar.
    ///
    /// Works in two passes. The first pass reads lines from the cursor and
    /// classifies each as `Normal`, `MoreIndented` (its text, after the
    /// common indent is stripped, starts with a space or tab) or `Empty`
    /// (only spaces before the line break). Reading stops at a non-blank
    /// line indented less than `content_indent` or at a document marker in
    /// column 0 — in both cases the cursor is rewound to the start of that
    /// line — or at end of input.
    ///
    /// The second pass joins the lines using the folding rules (§8.1.3):
    /// between two content lines with `k` empty lines in between,
    ///
    /// - both `Normal`, `k == 0`: joined by a single space;
    /// - both `Normal`, `k > 0`: joined by `k` newlines;
    /// - either `MoreIndented`: joined by `k + 1` newlines.
    ///
    /// Leading empty lines each contribute `\n`; the final content line
    /// contributes `\n` plus one `\n` per trailing empty line. The result
    /// is then trimmed by `apply_chomping` according to `chomping`
    /// (§8.1.1.2).
    fn collect_folded_block_content(
        &mut self,
        content_indent: usize,
        chomping: ChompingMode,
    ) -> Result<String> {
        // Classification of a collected line, used by the folding pass.
        #[derive(Clone, Copy, PartialEq, Eq)]
        enum LineKind {
            Normal,
            MoreIndented,
            Empty,
        }
        // One collected line: text with the common indent removed, plus
        // its classification.
        struct Line {
            text: String,
            kind: LineKind,
        }

        let mut lines: Vec<Line> = Vec::new();

        // Pass 1: read and classify lines.
        loop {
            // Count leading spaces, remembering the line start for rewind.
            let mut line_indent = 0;
            let save_pos = self.position;
            let save_ch = self.current_char;
            let save_idx = self.current_char_index;
            while self.current_char == Some(' ') {
                line_indent += 1;
                self.advance();
            }

            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);

            // A non-blank line with less indent ends the scalar; rewind so
            // the caller sees that line from its first column.
            if !line_is_blank && line_indent < content_indent {
                self.position = save_pos;
                self.current_char = save_ch;
                self.current_char_index = save_idx;
                break;
            }

            // A document marker at column 0 also ends the scalar.
            if line_indent == 0 && self.is_doc_marker_here() {
                self.position = save_pos;
                self.current_char = save_ch;
                self.current_char_index = save_idx;
                break;
            }

            if line_is_blank {
                // Blank line with an actual line break: record it and
                // continue. Blank at end of input: stop without recording.
                if matches!(self.current_char, Some('\n' | '\r')) {
                    lines.push(Line {
                        text: String::new(),
                        kind: LineKind::Empty,
                    });
                    self.advance();
                    continue;
                }
                break;
            }

            // Capture extra-indent leading spaces as part of content.
            let mut text = String::new();
            for _ in content_indent..line_indent {
                text.push(' ');
            }
            // Read the rest of the line, consuming the line break but not
            // storing it.
            while let Some(ch) = self.current_char {
                if ch == '\n' || ch == '\r' {
                    self.advance();
                    break;
                }
                text.push(ch);
                self.advance();
            }
            // §8.1.3.2: "more indented" means the content (after the
            // common indent strip) begins with extra whitespace —
            // either spaces or tabs (yaml-test-suite MJS9).
            let kind = if text.starts_with(' ') || text.starts_with('\t') {
                LineKind::MoreIndented
            } else {
                LineKind::Normal
            };
            lines.push(Line { text, kind });

            if self.current_char.is_none() {
                break;
            }
        }

        // Pass 2: build the folded output.
        let mut content = String::new();
        let mut idx = 0;
        while idx < lines.len() {
            let line = &lines[idx];
            match line.kind {
                LineKind::Normal | LineKind::MoreIndented => {
                    content.push_str(&line.text);
                    // Lookahead: count immediately-following empty lines.
                    // Afterwards `j` indexes the next content line, or
                    // equals `lines.len()` if there is none.
                    let mut j = idx + 1;
                    let mut empties = 0;
                    while j < lines.len() && lines[j].kind == LineKind::Empty {
                        empties += 1;
                        j += 1;
                    }
                    if j < lines.len() {
                        // Spec §8.1.3.2: folding behaviour depends on
                        // whether either surrounding content line is
                        // "more indented" than the content indent.
                        // - both Normal, 0 empties → fold to space.
                        // - both Normal, k empties → k newlines (one
                        //   break folded out).
                        // - any MoreIndented, 0 empties → 1 newline.
                        // - any MoreIndented, k empties → k+1 newlines
                        //   (every break preserved).
                        let mi_adjacent = line.kind == LineKind::MoreIndented
                            || lines[j].kind == LineKind::MoreIndented;
                        if empties == 0 {
                            if mi_adjacent {
                                content.push('\n');
                            } else {
                                content.push(' ');
                            }
                        } else {
                            let breaks = if mi_adjacent { empties + 1 } else { empties };
                            for _ in 0..breaks {
                                content.push('\n');
                            }
                        }
                        // Resume at the next content line; the empties in
                        // between have been accounted for.
                        idx = j;
                    } else {
                        // End of stream after content (possibly trailing empties).
                        // Always emit final `\n` for the last content line; extra
                        // trailing empties contribute additional `\n`s, and chomping
                        // will trim them later if needed.
                        content.push('\n');
                        for _ in 0..empties {
                            content.push('\n');
                        }
                        break;
                    }
                }
                LineKind::Empty => {
                    // Leading empty lines (no preceding content): emit as `\n`s.
                    content.push('\n');
                    idx += 1;
                }
            }
        }

        Ok(apply_chomping(content, chomping))
    }
3255
3256    /// Emit a `BlockMappingStart` token if the current position is the
3257    /// start of an implicit key and no mapping is yet active at this
3258    /// indent level. Shared by plain and quoted scalar dispatch.
3259    fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3260        // Use `unwrap_or(0)` for parity with the indentation module's
3261        // helpers — defends against error-recovery pop paths that could
3262        // leave the stack momentarily empty (#18).
3263        let last_indent = self.indent_stack.last().copied().unwrap_or(0);
3264        let should_start_new_mapping = if self.current_indent > last_indent {
3265            true
3266        } else if self.current_indent == last_indent {
3267            !self.check_active_mapping_at_level(self.current_indent)
3268        } else {
3269            false
3270        };
3271        if should_start_new_mapping {
3272            // §6.1 + §8.22: opening a NEW block mapping at deeper
3273            // indent than the parent only makes sense if the parent
3274            // has a key WITHOUT a value (the new mapping IS that
3275            // value). If the parent's last content is a complete
3276            // (key, value) pair — i.e. the most recent meaningful
3277            // token is a value-position scalar/alias/close — then
3278            // there's no node to host the deeper mapping (yaml-test-
3279            // suite U44R: \`map:\\n  key1: q\\n   key2: bad\` — key2
3280            // is deeper than key1 but key1's value is already \`q\`).
3281            if self.current_indent > last_indent && last_indent > 0 {
3282                let mut depth = 0i32;
3283                let mut last_meaningful = None;
3284                for t in self.tokens.iter().rev() {
3285                    match &t.token_type {
3286                        TokenType::BlockEnd => depth += 1,
3287                        TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3288                            if depth == 0 {
3289                                break;
3290                            }
3291                            depth -= 1;
3292                        }
3293                        TokenType::Anchor(_) | TokenType::Tag(_) => {}
3294                        other => {
3295                            if depth == 0 {
3296                                last_meaningful = Some(other.clone());
3297                                break;
3298                            }
3299                        }
3300                    }
3301                }
3302                if matches!(
3303                    last_meaningful,
3304                    Some(
3305                        TokenType::Scalar(..)
3306                            | TokenType::Alias(_)
3307                            | TokenType::FlowSequenceEnd
3308                            | TokenType::FlowMappingEnd
3309                            | TokenType::BlockScalarLiteral(..)
3310                            | TokenType::BlockScalarFolded(..)
3311                    )
3312                ) {
3313                    return Err(Error::scan(
3314                        self.position,
3315                        "Indentation increase has no parent in current mapping/sequence"
3316                            .to_string(),
3317                    ));
3318                }
3319            }
3320            self.indent_stack.push(self.current_indent);
3321            self.indent_is_sequence.push(false);
3322            self.resource_tracker
3323                .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3324            self.tokens
3325                .push(Token::simple(TokenType::BlockMappingStart, self.position));
3326        }
3327        Ok(())
3328    }
3329
3330    /// Look ahead on the current line for a `:` that marks a mapping key.
3331    ///
3332    /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3333    /// followed by whitespace. Only `: ` terminates the scalar. If the
3334    /// line begins with `"` or `'`, the leading quoted scalar's contents
3335    /// are scanned past (including `''` and `\"` escapes) before looking
3336    /// for the `: ` that would make this scalar a key. This handles
3337    /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3338    /// For an alias/anchor at the current position, scan past
3339    /// the `&`/`*` and the name characters; if the FIRST char that
3340    /// would terminate the name is `:`, the colon is PART of the
3341    /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3342    /// that case so the caller can skip the implicit-key fast-path.
3343    fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3344        // Start after the `&` / `*` introducer.
3345        let mut i = self.current_char_index + 1;
3346        let n = self.char_cache.len();
3347        // Per scan_identifier rules: stop at whitespace or flow indicator.
3348        while i < n {
3349            let c = self.char_cache[i];
3350            if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3351                break;
3352            }
3353            i += 1;
3354        }
3355        // If the next char (or last consumed?) at termination is `:`,
3356        // then the name ended with `:`. Look at the LAST consumed
3357        // char. Actually our scan_identifier accepts `:` as part of
3358        // name — so the colon is already in the name. There's no
3359        // separate "value indicator" colon after.
3360        //
3361        // For the implicit-key fast path to be wrong, we need the
3362        // name to END with `:` (last char of name is `:`).
3363        if i > self.current_char_index + 1 {
3364            let last_name_char = self.char_cache[i - 1];
3365            if last_name_char == ':' {
3366                return true;
3367            }
3368        }
3369        false
3370    }
3371
3372    /// Scan ahead on the current line (the rest of the post-indent
3373    /// content) to determine whether it looks like an implicit
3374    /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3375    /// before any newline.
3376    fn line_after_indent_is_implicit_key(&self) -> bool {
3377        let mut i = self.current_char_index;
3378        let n = self.char_cache.len();
3379        while i < n {
3380            let ch = self.char_cache[i];
3381            if ch == '\n' || ch == '\r' {
3382                return false;
3383            }
3384            if ch == ':' {
3385                let next = self.char_cache.get(i + 1).copied();
3386                if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3387                    return true;
3388                }
3389            }
3390            i += 1;
3391        }
3392        false
3393    }
3394
3395    /// Walk back through recent tokens; if the last non-property
3396    /// token was `Value` (`:`), the parser is in value-expectation
3397    /// mode (key not yet matched with a value).
3398    fn most_recent_token_is_value_separator(&self) -> bool {
3399        for t in self.tokens.iter().rev() {
3400            match t.token_type {
3401                TokenType::Anchor(_) | TokenType::Tag(_) => {}
3402                TokenType::Value => return true,
3403                _ => return false,
3404            }
3405        }
3406        false
3407    }
3408
3409    fn check_for_mapping_ahead(&self) -> bool {
3410        let mut i = self.current_char_index;
3411        let n = self.char_cache.len();
3412        if i < n {
3413            let first = self.char_cache[i];
3414            if first == '\'' || first == '"' {
3415                let quote = first;
3416                i += 1;
3417                while i < n {
3418                    let c = self.char_cache[i];
3419                    if c == '\n' || c == '\r' {
3420                        return false; // unterminated quote on line
3421                    }
3422                    if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3423                        // `''` is the in-string single-quote escape.
3424                        i += 2;
3425                        continue;
3426                    }
3427                    if quote == '"' && c == '\\' {
3428                        // Skip the escaped char.
3429                        i += 2;
3430                        continue;
3431                    }
3432                    if c == quote {
3433                        i += 1;
3434                        break;
3435                    }
3436                    i += 1;
3437                }
3438            }
3439        }
3440        // Skip balanced flow collections — a `:` *inside* `[...]` or
3441        // `{...}` does NOT make the line a block-mapping key (the flow
3442        // collection itself can BE the key, but its inner colons are
3443        // part of its own structure). yaml-test-suite: `{key: v}` is
3444        // a standalone flow mapping; `[a]: outer` is a block-map key.
3445        let mut flow_depth: i32 = 0;
3446        while i < n {
3447            let ch = self.char_cache[i];
3448            match ch {
3449                '\n' | '\r' => return false,
3450                '[' | '{' => flow_depth += 1,
3451                ']' | '}' => flow_depth -= 1,
3452                ':' if flow_depth <= 0 => {
3453                    let next = self.char_cache.get(i + 1).copied();
3454                    match next {
3455                        None => return true,
3456                        Some(c) if c.is_whitespace() => return true,
3457                        _ => {}
3458                    }
3459                }
3460                _ => {}
3461            }
3462            i += 1;
3463        }
3464        false
3465    }
3466
3467    /// Check if there's an active mapping at the specified indentation level
3468    /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3469    fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3470        let mut depth = 0;
3471
3472        // Walk backwards through tokens to find the innermost unmatched block start.
3473        // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3474        // decrement it (both open blocks that need a matching BlockEnd).
3475        // When depth == 0 we have found the block start that is still "open".
3476        for token in self.tokens.iter().rev() {
3477            match &token.token_type {
3478                TokenType::BlockMappingStart => {
3479                    if depth == 0 {
3480                        // The innermost open block is a mapping — active at this level.
3481                        return true;
3482                    }
3483                    depth -= 1;
3484                }
3485                TokenType::BlockSequenceStart => {
3486                    if depth == 0 {
3487                        // The innermost open block is a sequence, not a mapping.
3488                        return false;
3489                    }
3490                    depth -= 1;
3491                }
3492                TokenType::BlockEnd => {
3493                    depth += 1;
3494                }
3495                TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3496                    // Stop at document boundaries
3497                    break;
3498                }
3499                _ => {}
3500            }
3501        }
3502
3503        false
3504    }
3505}
3506
3507impl Scanner for BasicScanner {
3508    fn check_token(&self) -> bool {
3509        // For lazy scanning: check if we have cached tokens or can generate more
3510        self.token_index < self.tokens.len() || !self.done
3511    }
3512
3513    fn peek_token(&self) -> Result<Option<&Token>> {
3514        // This is a bit tricky with lazy scanning since peek shouldn't mutate
3515        // For now, return cached token if available
3516        Ok(self.tokens.get(self.token_index))
3517    }
3518
3519    fn get_token(&mut self) -> Result<Option<Token>> {
3520        // If we need more tokens and haven't finished, scan next token
3521        if self.token_index >= self.tokens.len() && !self.done {
3522            self.scan_next_token()?;
3523        }
3524
3525        if self.token_index < self.tokens.len() {
3526            let token = self.tokens[self.token_index].clone();
3527            self.token_index += 1;
3528            Ok(Some(token))
3529        } else {
3530            Ok(None)
3531        }
3532    }
3533
3534    fn reset(&mut self) {
3535        self.token_index = 0;
3536        self.position = Position::start();
3537        self.tokens.clear();
3538        self.done = false;
3539        self.current_char = self.input.chars().next();
3540        self.indent_stack = vec![0];
3541        self.current_indent = 0;
3542        self.flow_level = 0;
3543        self.detected_indent_style = None;
3544        self.indent_samples.clear();
3545        self.previous_indent_level = 0;
3546        self.current_char_index = 0;
3547        self.current_char = self.char_cache.first().copied();
3548    }
3549
3550    fn position(&self) -> Position {
3551        self.position
3552    }
3553
3554    fn input(&self) -> &str {
3555        &self.input
3556    }
3557}
3558
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression for #19. Reaching this constructor with malformed input
    /// must record the scanning error so callers can detect failure via
    /// `has_scanning_error()`. Previously the result of `scan_all_tokens`
    /// was dropped, silently truncating the token stream.
    #[test]
    fn new_eager_with_comments_propagates_scanning_errors() {
        // A doc-start marker inside an unterminated quoted scalar is a
        // scanning error (see `Error::scan(... "inside quoted scalar")`).
        // First confirm the non-comment constructor reports it — that
        // anchors the parity check.
        let input = "\"abc\n---\n";
        let plain = BasicScanner::new_eager(input.to_string());
        assert!(
            plain.has_scanning_error(),
            "precondition: malformed input must produce a scanning error via new_eager"
        );

        let with_comments = BasicScanner::new_eager_with_comments(input.to_string());
        assert!(
            with_comments.has_scanning_error(),
            "new_eager_with_comments must NOT silently swallow scanner errors"
        );
    }

    /// Drive the parser pipeline on `input` in a dedicated thread, returning
    /// `None` if it doesn't finish within `Duration::from_secs(2)`. Used by
    /// regression tests for parser hangs so a still-broken parser doesn't
    /// block the whole `cargo test` run.
    ///
    /// Events are collected until the parser reports the end of the stream
    /// or its first error; a recorded scanning error is taken and ignored.
    fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
        use crate::parser::{BasicParser, Parser as ParserTrait};
        use std::sync::mpsc;
        use std::thread;
        use std::time::Duration;

        let owned = input.to_string();
        let (tx, rx) = mpsc::channel();
        thread::spawn(move || {
            let mut p = BasicParser::new_eager(owned);
            let _ = p.take_scanning_error();
            let mut events = Vec::new();
            loop {
                match p.get_event() {
                    Ok(Some(ev)) => events.push(ev),
                    Ok(None) => break,
                    Err(_) => break,
                }
            }
            // The receiver may already have timed out; a failed send is fine.
            let _ = tx.send(events);
        });
        rx.recv_timeout(Duration::from_secs(2)).ok()
    }

    /// Regression: `---` directly followed by non-space text used to spin the
    /// scanner forever because the `-` match arm at line-start dispatched to
    /// `scan_document_start` (which correctly returned None) and then to
    /// `is_plain_scalar_start` (which returns false for `-`, so no consumption
    /// occurred — outer `while let` re-entered with the same char). Fix:
    /// fall through to `scan_plain_scalar` unconditionally when not a doc
    /// marker — the guard already ensures the char is non-whitespace.
    /// See yaml-test-suite tests 82AN / EXG3.
    #[test]
    fn three_dashes_directly_followed_by_text_does_not_hang() {
        let events = parse_with_timeout("---word1\nword2\n")
            .expect("parser hung — `---word1` should not produce an infinite loop");
        // We must produce at least one scalar whose value starts with `---`,
        // proving that the dashes were consumed as part of a plain scalar
        // (not interpreted as a document marker, which would consume them
        // separately).
        let starts_with_dashes = events.iter().any(|e| {
            matches!(&e.event_type,
                crate::parser::EventType::Scalar { value, .. } if value.starts_with("---")
            )
        });
        assert!(
            starts_with_dashes,
            "expected a plain scalar starting with `---`, got events: {events:?}"
        );
    }

    /// YAML 1.2 §7.3.3: `?`, `:`, and `-` may start a plain scalar provided
    /// the next character is non-space (and, in flow context, not a flow
    /// indicator). The previous `is_plain_scalar_start` unconditionally
    /// rejected those three characters, so plain scalars like `?foo`,
    /// `:foo`, `-foo` were reported as `Invalid character`.
    /// Tracked by yaml-test-suite 2EBW.
    #[test]
    fn question_mark_followed_by_text_starts_plain_scalar() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("?foo: bar\n".to_string());
        assert!(p.take_scanning_error().is_none());
        // Collects every scalar in document order: the key, then the value.
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec!["?foo", "bar"]);
    }

    /// Companion to the `?foo` case: a `:` directly followed by text starts
    /// a plain scalar, so `:foo: bar` yields the key `:foo` and value `bar`.
    #[test]
    fn colon_followed_by_text_starts_plain_scalar() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(":foo: bar\n".to_string());
        assert!(p.take_scanning_error().is_none());
        // Collects every scalar in document order: the key, then the value.
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec![":foo", "bar"]);
    }

    /// YAML 1.2: every started document must be closed with a DocumentEnd
    /// event before StreamEnd. The previous `TokenType::StreamEnd` handler
    /// only emitted `-DOC` for `DocumentContent` / `BlockNode` states —
    /// the `DocumentStart` state (entered after `---` and a single scalar
    /// like `"foo"`) was skipped, dropping the `-DOC` event. Affected by
    /// yaml-test-suite 27NA, 2G84/*, 2LFX and several others.
    #[test]
    fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n\"foo\"\n".to_string());
        assert!(p.take_scanning_error().is_none());
        let mut kinds = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            kinds.push(match ev.event_type {
                EventType::StreamStart => "+STR",
                EventType::StreamEnd => "-STR",
                EventType::DocumentStart { .. } => "+DOC",
                EventType::DocumentEnd { .. } => "-DOC",
                EventType::Scalar { .. } => "=VAL",
                _ => "?",
            });
        }
        // Critical: -DOC must come before -STR.
        let doc_end_idx = kinds.iter().position(|s| *s == "-DOC");
        let str_end_idx = kinds.iter().position(|s| *s == "-STR");
        assert!(
            doc_end_idx.is_some(),
            "missing -DOC in event stream: {kinds:?}"
        );
        // `None < Some(_)`, so this also fails when `-STR` is missing.
        assert!(
            doc_end_idx < str_end_idx,
            "expected -DOC before -STR, got {kinds:?}"
        );
    }

    /// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings.
    ///
    /// Covers the 2-digit `\x`, 4-digit `\u` and 8-digit `\U` forms, plus a
    /// literal non-ASCII character as a control that involves no escape.
    #[test]
    fn double_quoted_hex_escapes_decode_to_codepoint() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        for (input, expected) in [
            (r#""\x41""#, "A"),
            (r#""\u00e9""#, "\u{e9}"),
            (r#""é""#, "é"),
            (r#""\U0001F600""#, "\u{1f600}"),
        ] {
            let mut p = BasicParser::new_eager(input.to_string());
            assert!(
                p.take_scanning_error().is_none(),
                "no scan error for {input}"
            );
            let mut found = None;
            while let Ok(Some(ev)) = p.get_event() {
                if let EventType::Scalar { value, .. } = ev.event_type {
                    found = Some(value);
                    break;
                }
            }
            assert_eq!(found.as_deref(), Some(expected), "input {input}");
        }
    }

    /// A `\x` escape with fewer than two hex digits must be recorded as a
    /// scanning error.
    #[test]
    fn truncated_hex_escape_is_a_scan_error() {
        use crate::parser::BasicParser;
        let mut p = BasicParser::new_eager(r#""\x4""#.to_string());
        assert!(
            p.take_scanning_error().is_some(),
            "truncated \\x escape must error"
        );
    }

    /// YAML 1.2 §5.7: double-quoted strings have a strict allowlist of escape
    /// sequences. `\.` (and any other unknown escape) must be reported as a
    /// scan error. Tracked by yaml-test-suite 55WF.
    #[test]
    fn invalid_double_quoted_escape_is_a_scan_error() {
        use crate::parser::{BasicParser, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n\"\\.\"\n".to_string());
        let scan_err = p.take_scanning_error();
        // Accept the failure from either stage: eager scan or event pull.
        let mut parse_err = false;
        if scan_err.is_none() {
            loop {
                match p.get_event() {
                    Ok(Some(_)) => {}
                    Ok(None) => break,
                    Err(_) => {
                        parse_err = true;
                        break;
                    }
                }
            }
        }
        assert!(
            scan_err.is_some() || parse_err,
            "`\\.` is not a valid double-quoted escape and must error"
        );
    }

    /// YAML 1.2: a complex-key marker (`?`) is the first content after an
    /// explicit document start (`---`) — it should open an implicit block
    /// mapping. The previous parser handled `?` only in
    /// `ImplicitDocumentStart` / `DocumentContent` / already-in-mapping
    /// states and errored out for `DocumentStart`, breaking inputs like
    /// `--- !!set\n? Mark McGwire\n...`. Tracked by yaml-test-suite 2XXW.
    #[test]
    fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p =
            BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
        assert!(p.take_scanning_error().is_none());
        let mut saw_map_start = false;
        let mut saw_error = false;
        loop {
            match p.get_event() {
                Ok(Some(ev)) => {
                    if matches!(ev.event_type, EventType::MappingStart { .. }) {
                        saw_map_start = true;
                    }
                }
                Ok(None) => break,
                Err(_) => {
                    saw_error = true;
                    break;
                }
            }
        }
        assert!(!saw_error, "complex key after `--- !!set` must not error");
        assert!(saw_map_start, "expected a MappingStart event");
    }

    /// YAML 1.2 §6.9.2: anchor / alias names exclude only whitespace and
    /// the flow indicators `,[]{}`. Earlier implementations restricted
    /// `scan_identifier` to ASCII alphanumeric / `_` / `-`, which rejected
    /// valid unicode anchors like `&😁`. Tracked by yaml-test-suite 8XYN.
    #[test]
    fn anchor_name_may_contain_unicode_symbols() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
        assert!(
            p.take_scanning_error().is_none(),
            "unicode anchor must not error"
        );
        let mut anchors = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar {
                anchor: Some(a), ..
            } = ev.event_type
            {
                anchors.push(a);
            }
        }
        assert_eq!(anchors, vec!["😁"]);
    }

    /// YAML 1.2 §5.6 / RFC 3986 percent-encoding: tag suffixes may contain
    /// `%XX` percent-escaped characters, which must be URI-decoded when
    /// resolved. The scanner used to reject `%` in tag suffixes as
    /// "Invalid character", so e.g. `!e!tag%21 baz` failed before the
    /// resolver got a chance to decode it. Tracked by yaml-test-suite 6CK3.
    #[test]
    fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(
            "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n".to_string(),
        );
        assert!(
            p.take_scanning_error().is_none(),
            "tag percent-escapes must not error"
        );
        let mut tags = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { tag: Some(t), .. } = ev.event_type {
                tags.push(t);
            }
        }
        assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
    }

    /// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it
    /// does not recognize." A `%FOO` reserved directive must NOT be treated
    /// as a scan error — the directive line is silently skipped and parsing
    /// continues. Tracked by yaml-test-suite test 2LFX.
    #[test]
    fn reserved_directive_is_ignored_not_an_error() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(
            "%FOO  bar baz # Should be ignored\n              # with a warning.\n---\n\"foo\"\n"
                .to_string(),
        );
        assert!(
            p.take_scanning_error().is_none(),
            "unknown directives must NOT produce a scan error"
        );
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec!["foo"]);
    }

    /// Spec requires the two physical lines of `---word1\nword2` to fold into
    /// a single plain scalar `"---word1 word2"`. Tracked by yaml-test-suite 82AN.
    #[test]
    fn three_dashes_followed_by_text_folds_continuation_line() {
        let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");
        let scalars: Vec<&str> = events
            .iter()
            .filter_map(|e| match &e.event_type {
                crate::parser::EventType::Scalar { value, .. } => Some(value.as_str()),
                _ => None,
            })
            .collect();
        assert_eq!(scalars, vec!["---word1 word2"]);
    }

    /// Regression: tab between block-entry marker and a `-N` value used to
    /// hang the scanner via the same `-` match arm. See yaml-test-suite
    /// Y79Y/010.
    #[test]
    fn dash_tab_negative_number_does_not_hang() {
        let events = parse_with_timeout("-\t-1\n")
            .expect("parser hung — `-\\t-1` should not produce an infinite loop");
        assert!(!events.is_empty(), "expected event stream, got none");
    }

    /// A bare scalar document tokenizes as `StreamStart`, one scalar, then
    /// `StreamEnd`.
    #[test]
    fn test_basic_tokenization() {
        let mut scanner = BasicScanner::new("42".to_string());

        assert!(scanner.check_token());

        // StreamStart
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamStart));

        // Number
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "42");
        } else {
            panic!("Expected scalar token");
        }

        // StreamEnd
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamEnd));
    }

    /// Checks the opening tokens of a flow sequence: `[`, the first scalar,
    /// and the `,` entry separator.
    #[test]
    fn test_flow_sequence() {
        let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

        // StreamStart
        scanner.get_token().unwrap();

        // [
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

        // 1
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "1");
        } else {
            // Without this branch a non-scalar token would pass unnoticed.
            panic!("Expected scalar token");
        }

        // ,
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowEntry));
    }

    /// A double-quoted scalar is returned with its quotes removed.
    #[test]
    fn test_quoted_strings() {
        let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

        // StreamStart
        scanner.get_token().unwrap();

        // Quoted string
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "hello world");
        } else {
            panic!("Expected scalar token");
        }
    }

    /// Full-line and end-of-line comments must not leak into the token
    /// stream produced by the default scanner.
    #[test]
    fn test_comment_handling() {
        let input = r"
# Full line comment
key: value  # End of line comment
# Another comment
data: test
";
        let mut scanner = BasicScanner::new(input.to_string());

        let mut tokens = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            tokens.push(token);
        }

        // Should only contain YAML structure tokens, no comment tokens
        let scalar_values: Vec<String> = tokens
            .iter()
            .filter_map(|t| match &t.token_type {
                TokenType::Scalar(s, _) => Some(s.clone()),
                _ => None,
            })
            .collect();

        assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

        // Should not contain any comment tokens
        assert!(
            !tokens
                .iter()
                .any(|t| matches!(t.token_type, TokenType::Comment(_)))
        );
    }

    /// A `#` inside a quoted scalar is content; a `#` after a plain scalar
    /// and a space starts a comment that must be dropped.
    #[test]
    fn test_hash_in_strings() {
        let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
        let mut scanner = BasicScanner::new(input.to_string());

        let mut scalar_values = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            if let TokenType::Scalar(value, _) = token.token_type {
                scalar_values.push(value);
            }
        }

        assert!(scalar_values.contains(&"This has a # character".to_string()));
        assert!(scalar_values.contains(&"Also has # character".to_string()));
        assert!(scalar_values.contains(&"value".to_string()));
        assert!(
            !scalar_values
                .iter()
                .any(|s| s.contains("This is a comment"))
        );
    }

    /// Scan `input` and return the value of the first token after
    /// `StreamStart`, panicking (with the scanner error, if any) unless that
    /// token is a scalar.
    fn first_scalar_token(input: &str) -> String {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => value,
                _ => panic!("Expected scalar token for input: {input}"),
            },
            Ok(None) => panic!("Failed to get token for input: {input}"),
            Err(err) => panic!("Failed to get token for input: {input}: {err:?}"),
        }
    }

    /// The common C-style escapes decode inside double-quoted scalars.
    #[test]
    fn test_escape_sequences() {
        // YAML 1.2 §5.7 double-quoted escape sequences. Single-quoted strings
        // have NO backslash escapes — `''` is the only escape — so this set
        // is restricted to the double-quoted cases.
        let test_cases = [
            (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
            (r#""Col1\tCol2""#, "Col1\tCol2"),
            (r#""First\rSecond""#, "First\rSecond"),
            (r#""Path\\to\\file""#, "Path\\to\\file"),
            (r#""He said \"Hello\"""#, "He said \"Hello\""),
        ];

        for (input, expected) in test_cases {
            assert_eq!(
                first_scalar_token(input),
                expected,
                "Failed for input: {input}"
            );
        }
    }

    /// The remaining single-character YAML escapes decode to their control
    /// or literal characters.
    #[test]
    fn test_extended_yaml_escapes() {
        // Test additional YAML escape sequences
        let test_cases = [
            (r#""\0""#, "\0"),   // null character
            (r#""\a""#, "\x07"), // bell
            (r#""\b""#, "\x08"), // backspace
            (r#""\f""#, "\x0C"), // form feed
            (r#""\v""#, "\x0B"), // vertical tab
            (r#""\e""#, "\x1B"), // escape
            (r#""\ ""#, " "),    // literal space
            (r#""\/""#, "/"),    // literal forward slash
        ];

        for (input, expected) in test_cases {
            assert_eq!(
                first_scalar_token(input),
                expected,
                "Failed for input: {input}"
            );
        }
    }

    /// Escapes outside the YAML allowlist must fail the scan of the scalar.
    #[test]
    fn test_unknown_escape_sequences() {
        // YAML 1.2 §5.7: unknown double-quoted escapes are scan errors, not
        // preserved literals. (Earlier versions of this scanner kept the
        // backslash + char verbatim — see commit history.)
        for input in [r#""\z""#, r#""\q""#, r#""\8""#] {
            let mut scanner = BasicScanner::new(input.to_string());
            scanner.get_token().unwrap(); // StreamStart
            assert!(
                scanner.get_token().is_err(),
                "expected scan error for invalid escape in {input}"
            );
        }
    }
}