Skip to main content

rust_yaml/scanner/
mod.rs

1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
/// Trait for YAML scanners that convert character streams to tokens.
///
/// The interface offers a cheap availability check, a non-consuming peek,
/// a consuming read, and accessors (`position`, `input`) intended for
/// building diagnostics.
pub trait Scanner {
    /// Check if there are more tokens available.
    ///
    /// Takes `&self` and returns a plain `bool`, so it cannot report a
    /// scanning error itself; use `peek_token` / `get_token` for that.
    fn check_token(&self) -> bool;

    /// Peek at the next token without consuming it.
    ///
    /// Returns a borrowed token, or `Ok(None)` when no token is available.
    fn peek_token(&self) -> Result<Option<&Token>>;

    /// Get the next token, consuming it.
    ///
    /// Returns the token by value, or `Ok(None)` when no token is available.
    fn get_token(&mut self) -> Result<Option<Token>>;

    /// Reset the scanner state.
    fn reset(&mut self);

    /// Get the current position in the input.
    fn position(&self) -> Position;

    /// Get the input text for error reporting.
    fn input(&self) -> &str;
}
35
/// How the tail of a block scalar is treated (YAML 1.2 §8.1.1.2).
///
/// - `Strip` (indicator `-`): no final line break, no trailing empty lines.
/// - `Clip` (no indicator): a single final line break, no trailing empty lines.
/// - `Keep` (indicator `+`): the final line break and every trailing empty
///   line are retained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    Strip,
    Clip,
    Keep,
}

/// Trim the tail of a collected block scalar according to `mode`.
///
/// The collectors append a `\n` after every line, blank or not, so the raw
/// text may end in a run of line feeds. This helper rewrites that run:
///
/// - **Keep:** returned untouched.
/// - **Strip:** every trailing `\n` is removed.
/// - **Clip:** every trailing `\n` is removed and, if any text is left,
///   exactly one is put back. A scalar made only of line feeds (or an
///   empty one) becomes `""` (yaml-test-suite K858).
fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
    // Byte length of the text once the trailing run of `\n` is ignored.
    let body_len = s.trim_end_matches('\n').len();

    match mode {
        ChompingMode::Keep => {}
        ChompingMode::Strip => s.truncate(body_len),
        ChompingMode::Clip => {
            s.truncate(body_len);
            // Only a scalar with real content gets its final line break back.
            if body_len > 0 {
                s.push('\n');
            }
        }
    }

    s
}
81
/// A basic scanner implementation for YAML tokenization.
///
/// The scanner owns the input text plus a decoded `Vec<char>` copy of it
/// and walks that copy one character at a time, appending tokens to
/// `tokens`. Block structure is tracked through `indent_stack` (and its
/// parallel `indent_is_sequence`), flow nesting through `flow_level`.
#[derive(Debug)]
// NOTE(review): several fields/helpers appear to be unused in this part of
// the file; presumably that is why dead-code warnings are silenced — confirm.
#[allow(dead_code)]
pub struct BasicScanner {
    // Source text; also handed to `ErrorContext::from_input` for diagnostics.
    input: String,
    // Position of `current_char`; updated on every `advance`.
    position: Position,
    // Character under the cursor, `None` once the input is exhausted.
    current_char: Option<char>,
    // Tokens produced so far (structural tokens such as `BlockEnd` included).
    tokens: Vec<Token>,
    // Presumably the index of the next token to hand out — confirm with
    // the `Scanner` impl.
    token_index: usize,
    // Starts `true` when the input was rejected by the resource limits.
    done: bool,
    // Open block indentation levels; always starts as `[0]`.
    indent_stack: Vec<usize>,
    // Indentation measured for the line currently being scanned.
    current_indent: usize,
    // NOTE(review): `allow_simple_key` and `simple_key_allowed` are always
    // initialised to the same value here; verify whether both are needed.
    allow_simple_key: bool,
    simple_key_allowed: bool,
    // Nesting depth of flow collections; `> 0` means flow context.
    flow_level: usize,
    // Turned on by the `*_with_comments` constructors.
    preserve_comments: bool,
    // Indentation style detection
    detected_indent_style: Option<crate::value::IndentStyle>,
    indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
    previous_indent_level: usize,       // Track the previous indentation for style detection
    // Performance optimizations
    buffer: String,            // Reusable string buffer for token values
    char_cache: Vec<char>,     // Cached characters for faster access
    current_char_index: usize, // Current index in char_cache
    profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
    // Error tracking
    scanning_error: Option<Error>, // Store scanning errors for later retrieval
    // Resource tracking
    limits: Limits,
    resource_tracker: ResourceTracker,
    // Track inline nested sequences that need closing
    inline_sequence_depth: usize,
    // Track compact-notation sequences (where `-` is at the same indent as
    // the parent mapping keys). These are NOT on indent_stack, so we need
    // separate tracking to know when to emit BlockEnd for them.
    compact_sequence_indents: Vec<usize>,
    // Parallel to indent_stack: true when the entry was pushed by a block
    // sequence, false when by a mapping. Lets us distinguish "continuing a
    // regular sequence" from "starting a compact sequence at same indent".
    indent_is_sequence: Vec<bool>,
}
123
124impl BasicScanner {
125    /// Create a new scanner from input string
126    pub fn new(input: String) -> Self {
127        Self::with_limits(input, Limits::default())
128    }
129
130    /// Create a new scanner with custom resource limits
131    pub fn with_limits(input: String, limits: Limits) -> Self {
132        let char_cache: Vec<char> = input.chars().collect();
133        let current_char = char_cache.first().copied();
134
135        // Track document size for resource limits
136        let mut resource_tracker = ResourceTracker::new();
137        if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
138            // If the input is too large, create scanner with error state
139            return Self {
140                current_char: None,
141                input,
142                position: Position::start(),
143                tokens: Vec::new(),
144                token_index: 0,
145                done: true,
146                indent_stack: vec![0],
147                current_indent: 0,
148                allow_simple_key: false,
149                simple_key_allowed: false,
150                flow_level: 0,
151                preserve_comments: false,
152                detected_indent_style: None,
153                indent_samples: Vec::new(),
154                previous_indent_level: 0,
155                buffer: String::new(),
156                char_cache: Vec::new(),
157                current_char_index: 0,
158                profiler: None,
159                scanning_error: Some(e),
160                limits,
161                resource_tracker,
162                inline_sequence_depth: 0,
163                compact_sequence_indents: Vec::new(),
164                indent_is_sequence: vec![false],
165            };
166        }
167
168        Self {
169            current_char,
170            input,
171            position: Position::start(),
172            tokens: Vec::new(),
173            token_index: 0,
174            done: false,
175            indent_stack: vec![0], // Always start with base indentation
176            current_indent: 0,
177            allow_simple_key: true,
178            simple_key_allowed: true,
179            flow_level: 0,
180            preserve_comments: false,
181            detected_indent_style: None,
182            indent_samples: Vec::new(),
183            previous_indent_level: 0,
184            buffer: String::with_capacity(64), // Pre-allocate buffer
185            char_cache,
186            current_char_index: 0,
187            profiler: std::env::var("RUST_YAML_PROFILE")
188                .ok()
189                .map(|_| crate::profiling::YamlProfiler::new()),
190            scanning_error: None,
191            limits,
192            resource_tracker,
193            inline_sequence_depth: 0,
194            compact_sequence_indents: Vec::new(),
195            indent_is_sequence: vec![false],
196        }
197    }
198
199    /// Create a new scanner with eager token scanning (for compatibility)
200    pub fn new_eager(input: String) -> Self {
201        Self::new_eager_with_limits(input, Limits::default())
202    }
203
204    /// Create a new scanner with eager token scanning and custom limits
205    pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
206        let mut scanner = Self::with_limits(input, limits);
207        // Store any scanning errors for later retrieval
208        if let Err(error) = scanner.scan_all_tokens() {
209            scanner.scanning_error = Some(error);
210        }
211        scanner
212    }
213
214    /// Create a new scanner with comment preservation enabled
215    pub fn new_with_comments(input: String) -> Self {
216        let mut scanner = Self::new(input);
217        scanner.preserve_comments = true;
218        scanner
219    }
220
221    /// Create a new scanner with comments and custom limits
222    pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
223        let mut scanner = Self::with_limits(input, limits);
224        scanner.preserve_comments = true;
225        scanner
226    }
227
228    /// Create a new scanner with eager scanning and comment preservation
229    pub fn new_eager_with_comments(input: String) -> Self {
230        let mut scanner = Self::new_with_comments(input);
231        // Mirror `new_eager_with_limits`: record scanning errors instead
232        // of discarding them (#19). Previously this used
233        // `unwrap_or(())`, silently truncating the token stream and
234        // returning a scanner whose `has_scanning_error()` reported
235        // false — silent data loss for comment-preserving callers.
236        if let Err(error) = scanner.scan_all_tokens() {
237            scanner.scanning_error = Some(error);
238        }
239        scanner
240    }
241
242    /// Get the detected indentation style from the document
243    pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
244        self.detected_indent_style.as_ref()
245    }
246
247    /// Check if there was a scanning error
248    pub const fn has_scanning_error(&self) -> bool {
249        self.scanning_error.is_some()
250    }
251
252    /// Get the scanning error if any
253    #[allow(clippy::missing_const_for_fn)]
254    pub fn take_scanning_error(&mut self) -> Option<Error> {
255        self.scanning_error.take()
256    }
257
258    /// Advance to the next character
259    fn advance(&mut self) -> Option<char> {
260        if let Some(ch) = self.current_char {
261            self.position = self.position.advance(ch);
262            self.current_char_index += 1;
263
264            if self.current_char_index < self.char_cache.len() {
265                self.current_char = Some(self.char_cache[self.current_char_index]);
266            } else {
267                self.current_char = None;
268            }
269        }
270
271        self.current_char
272    }
273
274    /// Skip whitespace characters (excluding newlines)
275    fn skip_whitespace(&mut self) {
276        while let Some(ch) = self.current_char {
277            if ch == ' ' || ch == '\t' {
278                self.advance();
279            } else {
280                break;
281            }
282        }
283    }
284
285    /// Handle indentation and produce block tokens if necessary
286    fn handle_indentation(&mut self) -> Result<()> {
287        // In flow context: if there is a non-trivial enclosing block
288        // (indent_stack has more than the implicit root level), each
289        // continuation line that has content must be indented MORE than
290        // that enclosing block's indent. \`flow: [a,\\nb,c]\` with \`b\`
291        // at col 1 violates this rule because the block mapping enclosing
292        // \`flow:\` sits at indent 0 (yaml-test-suite 9C9N).
293        //
294        // Top-level flow (no enclosing block; indent_stack is just \[0\])
295        // is exempt — `[a,\\nb]` is fine there because the flow content
296        // isn't nested inside any block (yaml-test-suite 4ZYM).
297        if self.flow_level > 0 {
298            if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
299                let mut probe = 0usize;
300                let mut i = self.current_char_index;
301                while i < self.char_cache.len() {
302                    match self.char_cache[i] {
303                        ' ' => {
304                            probe += 1;
305                            i += 1;
306                        }
307                        '\t' => i += 1,
308                        _ => break,
309                    }
310                }
311                let has_content = self
312                    .char_cache
313                    .get(i)
314                    .map_or(false, |c| !matches!(c, '\n' | '\r'));
315                // A line that begins with the matching flow closer
316                // (\`]\` / \`}\`) is allowed at the parent indent — it
317                // closes the flow collection, not adds content
318                // (yaml-test-suite NKF9 trailing-line \`}\` at col 1).
319                let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
320                if has_content && !is_closer {
321                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
322                    if probe <= parent_indent {
323                        return Err(Error::scan(
324                            self.position,
325                            "Flow content line is not indented enough".to_string(),
326                        ));
327                    }
328                }
329            }
330            return Ok(());
331        }
332
333        let line_start_pos = self.position;
334        let mut indent = 0;
335        let mut has_tabs = false;
336        let mut has_spaces = false;
337        let _indent_start_pos = self.position;
338
339        // Count indentation and detect style
340        while let Some(ch) = self.current_char {
341            if ch == ' ' {
342                indent += 1;
343                has_spaces = true;
344                self.advance();
345            } else if ch == '\t' {
346                indent += 8; // Tab counts as 8 spaces for indentation calculation
347                has_tabs = true;
348                self.advance();
349            } else {
350                break;
351            }
352        }
353
354        // Analyze indentation pattern for style detection
355        // Only analyze if there's actual content after the indentation (not just whitespace)
356        if indent > 0
357            && self.current_char.is_some()
358            && !matches!(self.current_char, Some('\n' | '\r'))
359        {
360            self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
361        }
362
363        // YAML 1.2 §6.1 does NOT require all indents to be multiples
364        // of a single "indent width". Siblings must share a column;
365        // children must indent further; but any positive amount works
366        // (e.g. `key:\n  child:\n   grandchild:` with widths 2, 1
367        // is legal). The earlier strict-multiple-of-N check rejected
368        // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
369        // UGM3. We rely on the indent_stack-driven open/close logic
370        // (and the per-block "more than parent" rule enforced
371        // elsewhere) to catch genuine mis-indentation.
372
373        // Update previous indentation level for future comparisons
374        if indent > 0 {
375            self.previous_indent_level = indent;
376        }
377
378        // Update current indentation level
379        self.current_indent = indent;
380
381        // Close compact-notation sequences whose scope ends at this line.
382        // A compact sequence (where `-` shares the indent of the parent
383        // mapping keys) ends when the next content line at that indent is
384        // NOT a block entry (`- `).  We must emit the sequence's BlockEnd
385        // BEFORE popping the indent_stack so that the nesting order is
386        // correct (sequence closes before its parent mapping).
387        let has_content =
388            self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
389        if has_content {
390            let is_block_entry = self.current_char == Some('-')
391                && self.peek_char(1).map_or(true, |c| c.is_whitespace());
392            while let Some(&seq_indent) = self.compact_sequence_indents.last() {
393                if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
394                    self.compact_sequence_indents.pop();
395                    self.tokens
396                        .push(Token::simple(TokenType::BlockEnd, line_start_pos));
397                } else {
398                    break;
399                }
400            }
401        }
402
403        // Check if we need to emit block end tokens for decreased indentation
404        let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
405        while let Some(&last_indent) = self.indent_stack.last() {
406            if indent < last_indent && last_indent > 0 {
407                self.indent_stack.pop();
408                self.indent_is_sequence.pop();
409                self.tokens
410                    .push(Token::simple(TokenType::BlockEnd, line_start_pos));
411            } else {
412                break;
413            }
414        }
415
416        // §6.1: after a dedent, the new line's indent must match some
417        // existing container level — keys/items at a sibling level
418        // must share a column. Landing at a column that is between
419        // two stack levels (e.g. parent at 0, just-closed at 3, new
420        // line at 1) is invalid because no open mapping/sequence sits
421        // at indent 1 (yaml-test-suite DMG6, N4JP).
422        //
423        // The check applies only when:
424        //   * we actually dedented (pre-pop top was deeper than now),
425        //   * the new line has content (the next char is not blank /
426        //     newline / EOF / comment),
427        //   * indent doesn't match the new top.
428        if pre_pop_top > 0
429            && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
430            && self
431                .current_char
432                .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
433            && indent != self.indent_stack.last().copied().unwrap_or(0)
434        {
435            // Allow if indent is a valid deeper level — e.g.
436            // sibling at depth then deeper child — but for the
437            // dedent path indent must equal a known stack level.
438            return Err(Error::scan(
439                self.position,
440                format!(
441                    "Indentation {indent} doesn't match any open container (expected {} or deeper)",
442                    self.indent_stack.last().copied().unwrap_or(0)
443                ),
444            ));
445        }
446
447        Ok(())
448    }
449
450    /// Analyze indentation pattern to detect the document's indentation style
451    fn analyze_indentation_pattern(
452        &mut self,
453        current_indent: usize,
454        has_tabs: bool,
455        has_spaces: bool,
456    ) -> Result<()> {
457        // Prevent mixed indentation (tabs + spaces on same line).
458        // Carve-out: a tab AFTER one or more spaces and BEFORE
459        // value-position content (not a key) is content-area
460        // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
461        // space is indent, the tab is a separator before \`bar\`
462        // which is the value of \`foo:\` (yaml-test-suite DK95/00).
463        if has_tabs && has_spaces {
464            // Peek ahead: if the content after the tab+spaces area
465            // contains a key marker (`: ` or `:`+EOL), treat as
466            // indentation (invalid). Otherwise it's a value line.
467            let looks_like_key = self.line_after_indent_is_implicit_key();
468            if looks_like_key {
469                let context =
470                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
471                        .with_suggestion(
472                            "Use either tabs OR spaces for indentation, not both".to_string(),
473                        );
474                return Err(Error::invalid_character_with_context(
475                    self.position,
476                    '\t',
477                    "mixed indentation",
478                    context,
479                ));
480            }
481        }
482        // §6.1: indentation must be space characters only. Pure-tab
483        // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
484        // 4EJS). Two carve-outs:
485        //   * The mixed case is caught by the earlier branch.
486        //   * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
487        //     at the root are not "block indentation" — there's no
488        //     enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
489        //     them.
490        if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
491            let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
492                .with_suggestion("Use space characters for indentation".to_string());
493            return Err(Error::invalid_character_with_context(
494                self.position,
495                '\t',
496                "indentation",
497                context,
498            ));
499        }
500
501        // If we detected tabs, check for mixed indentation across lines
502        if has_tabs {
503            match self.detected_indent_style {
504                None => {
505                    // First time detecting indentation style - set to tabs
506                    self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
507                }
508                Some(crate::value::IndentStyle::Spaces(_)) => {
509                    // Previously detected spaces, now seeing tabs - mixed indentation error
510                    let context =
511                        crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
512                            .with_suggestion(
513                                "Use consistent indentation style throughout the document"
514                                    .to_string(),
515                            );
516                    return Err(Error::invalid_character_with_context(
517                        self.position,
518                        '\t',
519                        "mixed indentation",
520                        context,
521                    ));
522                }
523                Some(crate::value::IndentStyle::Tabs) => {
524                    // Already using tabs - this is consistent
525                }
526            }
527            return Ok(());
528        }
529
530        // For spaces, check for mixed indentation across lines first
531        if has_spaces {
532            // Check if we previously detected tabs
533            if matches!(
534                self.detected_indent_style,
535                Some(crate::value::IndentStyle::Tabs)
536            ) {
537                let context =
538                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
539                        .with_suggestion(
540                            "Use consistent indentation style throughout the document".to_string(),
541                        );
542                return Err(Error::invalid_character_with_context(
543                    self.position,
544                    ' ',
545                    "mixed indentation",
546                    context,
547                ));
548            }
549
550            // Calculate the indentation level difference
551            if current_indent > self.previous_indent_level {
552                let indent_diff = current_indent - self.previous_indent_level;
553
554                // Store this sample for analysis (but only meaningful differences)
555                if indent_diff > 0 && indent_diff <= 8 {
556                    // Reasonable indentation range
557                    self.indent_samples.push((indent_diff, false));
558
559                    // Try to determine consistent indentation width
560                    if self.detected_indent_style.is_none() {
561                        self.detect_space_indentation_width();
562                    }
563                }
564            }
565
566            // YAML 1.2 §6.1 does NOT require all indents to be multiples
567            // of a single "indent width". Sibling lines must share a
568            // column and children must indent deeper than parents, but
569            // any positive amount works. The "multiple of N" check
570            // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
571            // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
572            // open/close logic for genuine mis-indentation. The detected
573            // style is still recorded for later style-preservation use
574            // (e.g. emitter), it just no longer drives validation.
575            // self.validate_indentation_consistency(current_indent)?;
576        }
577
578        Ok(())
579    }
580
581    /// Detect the consistent space indentation width from samples
582    fn detect_space_indentation_width(&mut self) {
583        if self.indent_samples.is_empty() {
584            return; // Need at least 1 sample
585        }
586
587        // Find the most common indentation width
588        let mut width_counts = std::collections::HashMap::new();
589
590        for &(width, is_tabs) in &self.indent_samples {
591            if !is_tabs && width > 0 {
592                *width_counts.entry(width).or_insert(0) += 1;
593            }
594        }
595
596        // Find the most frequent width - be more aggressive and detect early
597        if let Some((&most_common_width, &_count)) =
598            width_counts.iter().max_by_key(|&(_, count)| count)
599        {
600            // Set on first consistent sample to enable stricter validation
601            self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
602        }
603    }
604
605    /// Check if the given indentation level is valid based on current context
606    #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
607    fn is_valid_indentation_level(&self, indent: usize) -> bool {
608        // For now, allow any indentation that could represent valid nesting
609        // In the future, this could be made more strict by checking against
610        // the current indent_stack to ensure proper nesting
611        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
612            // Must be a multiple of the detected width
613            indent % width == 0
614        } else {
615            // If no style detected yet, allow any indentation
616            true
617        }
618    }
619
620    /// Validate that current indentation is consistent with detected style
621    fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
622        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
623            // Check if current indentation is a multiple of the detected width
624            if current_indent > 0 && current_indent % width != 0 {
625                let lower_level = (current_indent / width) * width;
626                let higher_level = lower_level + width;
627                let suggestion = format!(
628                    "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
629                    width, lower_level, higher_level, current_indent
630                );
631                let context =
632                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
633                        .with_suggestion(suggestion);
634                return Err(Error::indentation_with_context(
635                    self.position,
636                    (current_indent / width) * width, // expected (nearest valid level)
637                    current_indent,                   // found
638                    context,
639                ));
640            }
641        }
642        Ok(())
643    }
644
645    /// Check if current position starts a plain scalar
646    fn is_plain_scalar_start(&self) -> bool {
647        self.current_char.map_or(false, |ch| match ch {
648            // Pure indicators — never start a plain scalar.
649            ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
650            | '@' | '`' => false,
651            // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
652            // the next character is non-whitespace (and, in flow context,
653            // not a flow indicator). Otherwise they act as indicators
654            // (complex-key marker / value separator / block-entry marker).
655            '?' | ':' | '-' => match self.peek_char(1) {
656                None => false,
657                Some(c) if c.is_whitespace() => false,
658                Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
659                Some(_) => true,
660            },
661            _ => !ch.is_whitespace(),
662        })
663    }
664
665    /// Check if the value is a YAML boolean
666    fn is_yaml_bool(value: &str) -> bool {
667        matches!(
668            value,
669            "true"
670                | "false"
671                | "True"
672                | "False"
673                | "TRUE"
674                | "FALSE"
675                | "yes"
676                | "no"
677                | "Yes"
678                | "No"
679                | "YES"
680                | "NO"
681                | "on"
682                | "off"
683                | "On"
684                | "Off"
685                | "ON"
686                | "OFF"
687        )
688    }
689
690    /// Check if the value is a YAML null
691    fn is_yaml_null(value: &str) -> bool {
692        matches!(value, "null" | "Null" | "NULL" | "~" | "")
693    }
694
    /// Normalize a scalar value based on YAML rules.
    ///
    /// This is currently the identity function: `value` is returned
    /// unchanged.
    ///
    /// The scanner preserves the original text of plain scalars. Type
    /// resolution (including version-aware bool/null mapping) happens in
    /// the composer (see `crate::resolver::resolve_plain_scalar`). This
    /// preserves enough information for the composer to apply the
    /// YAML 1.1 vs 1.2 distinction and for round-trip emitters to
    /// recover the original spelling.
    fn normalize_scalar(value: String) -> String {
        // Intentionally no rewriting: the original spelling must survive.
        value
    }
706
707    /// Scan a number token
708    fn scan_number(&mut self) -> Result<Token> {
709        let start_pos = self.position;
710        let mut value = String::new();
711
712        // Handle negative numbers
713        if self.current_char == Some('-') {
714            value.push('-');
715            self.advance();
716        }
717
718        // Scan digits
719        while let Some(ch) = self.current_char {
720            if ch.is_ascii_digit() {
721                value.push(ch);
722                self.advance();
723            } else if ch == '.' {
724                value.push(ch);
725                self.advance();
726                // Scan fractional part
727                while let Some(ch) = self.current_char {
728                    if ch.is_ascii_digit() {
729                        value.push(ch);
730                        self.advance();
731                    } else {
732                        break;
733                    }
734                }
735                break;
736            } else {
737                break;
738            }
739        }
740
741        Ok(Token::new(
742            TokenType::Scalar(value, tokens::QuoteStyle::Plain),
743            start_pos,
744            self.position,
745        ))
746    }
747
    /// Scan a plain scalar (unquoted string)
    ///
    /// Content is collected one line at a time. On each line, scanning
    /// stops at a line break, at a `:` that acts as a key/value separator,
    /// at a `#` that starts a comment, and (in flow context only) at a flow
    /// indicator. When the stop was a line break, the function looks ahead
    /// to decide whether the next non-blank line continues the scalar; if
    /// so the break is folded into the value, otherwise the cursor is
    /// rewound to the line break and the scalar ends.
    ///
    /// The returned token is a `Scalar` with `QuoteStyle::Plain`, spanning
    /// from the starting position to the final cursor position, with
    /// trailing whitespace removed from the value.
    ///
    /// # Errors
    ///
    /// - A scalar that folded across lines in block context and is directly
    ///   followed (after spaces/tabs) by `:`.
    /// - Whatever `resource_tracker.check_string_length` reports for the
    ///   collected length.
    fn scan_plain_scalar(&mut self) -> Result<Token> {
        let start_pos = self.position;
        let start_col = start_pos.column;
        let mut value = String::new();
        // Set once at least one line break has been folded into `value`.
        let mut multi_line = false;

        loop {
            // Scan content on the current line until we hit a stop condition.
            while let Some(ch) = self.current_char {
                if self.flow_level == 0 {
                    match ch {
                        '\n' | '\r' => break,
                        // `:` ends the scalar only when followed by
                        // whitespace or end of input.
                        ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
                        // `#` starts a comment only at the very start of
                        // the scalar or right after whitespace.
                        '#' if value.is_empty()
                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
                        {
                            break;
                        }
                        _ => {}
                    }
                } else {
                    match ch {
                        // Same line-break handling as block context: stop
                        // collecting raw content at `\n`/`\r`, then let the
                        // outer fold logic decide whether the next line
                        // continues this scalar (yaml-test-suite 8KB6,
                        // 8UDB, 9BXH).
                        '\n' | '\r' => break,
                        ',' | '[' | ']' | '{' | '}' => break,
                        // In flow context, `:` is a key-value separator
                        // when followed by whitespace OR any flow indicator
                        // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
                        // suite FRK4 (`{ ? foo :, ... }`).
                        ':' if self
                            .peek_char(1)
                            .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
                        {
                            break;
                        }
                        '#' if value.is_empty()
                            || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
                        {
                            break;
                        }
                        _ => {}
                    }
                }
                value.push(ch);
                self.advance();
            }

            // If we didn't stop at a newline, this scalar is complete.
            if !matches!(self.current_char, Some('\n' | '\r')) {
                break;
            }

            // Per §6.5 line folding, trailing whitespace on the line is
            // dropped (it gets replaced by the fold separator that the
            // next continuation block emits).
            while matches!(value.chars().last(), Some(' ' | '\t')) {
                value.pop();
            }

            // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
            // the same plain scalar. A continuation line must:
            //   * satisfy the column rule computed below (`column_ok`),
            //   * not be a document marker (`---` / `...`),
            //   * not be a comment-only line,
            //   * not be empty-with-EOF.
            // Save state for backtracking if continuation isn't allowed.
            let saved_position = self.position;
            let saved_index = self.current_char_index;
            let saved_char = self.current_char;

            // Count physical newlines we skip; whitespace within the lines
            // is also consumed. `\r` is skipped without being counted, so a
            // CRLF pair counts as one newline.
            let mut newlines = 0usize;
            loop {
                match self.current_char {
                    Some('\n') => {
                        newlines += 1;
                        self.advance();
                    }
                    Some('\r') => {
                        self.advance();
                    }
                    Some(' ' | '\t') => {
                        self.advance();
                    }
                    _ => break,
                }
            }

            // The cursor now sits on the first non-blank character after
            // the line break(s), or at end of input.
            let next_col = self.position.column;
            let next_ch = self.current_char;
            // Three identical `-` or `.` characters followed by whitespace
            // or end of input.
            let is_doc_marker = matches!(next_ch, Some('-' | '.'))
                && self.peek_char(1) == next_ch
                && self.peek_char(2) == next_ch
                && self.peek_char(3).map_or(true, |c| c.is_whitespace());

            // Continuation column rule:
            //   * Flow context: no column rule, only flow indicators
            //     terminate (8KB6, 8UDB, 9BXH).
            //   * Block context: the line qualifies when its column is at
            //     least `parent_indent + 2`. The parent indent is the max
            //     of `indent_stack.last()` (block mapping/sequence indent)
            //     and `compact_sequence_indents.last()` — the latter
            //     tracks sequences opened compactly (e.g. `? - x` where
            //     the dash didn't push to indent_stack). Without the
            //     compact-stack check, `? - Detroit Tigers\n  - Chicago`
            //     would fold both lines into one scalar (yaml-test-
            //     suite M5DY).
            //     Independently of the parent indent, a line that starts
            //     at or beyond the scalar's own start column
            //     (`next_col >= start_col`) also qualifies.
            let column_ok = if self.flow_level > 0 {
                true
            } else {
                let block_indent = self.indent_stack.last().copied().unwrap_or(0);
                let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
                let parent_indent = block_indent.max(compact_indent);
                next_col >= parent_indent + 2 || next_col >= start_col
            };
            let can_continue = next_ch.is_some()
                && !matches!(next_ch, Some('\n' | '\r' | '#'))
                && column_ok
                && !is_doc_marker
                && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));

            if !can_continue {
                // Rewind to the line break so the caller sees it unconsumed.
                self.position = saved_position;
                self.current_char_index = saved_index;
                self.current_char = saved_char;
                break;
            }

            // Append fold separator: single newline → space; N>1 newlines
            // collapse to N-1 retained newlines (YAML §6.5 line folding).
            if newlines <= 1 {
                value.push(' ');
            } else {
                for _ in 0..(newlines - 1) {
                    value.push('\n');
                }
            }
            multi_line = true;
        }

        // Implicit keys must be on a single line. If the plain scalar
        // folded across line breaks AND the next non-whitespace char is
        // `:` (key-value separator), it's about to be used as an implicit
        // key — reject (yaml-test-suite G7JE). Only checked in block
        // context.
        if multi_line && self.flow_level == 0 {
            let mut off = 0isize;
            while matches!(self.peek_char(off), Some(' ' | '\t')) {
                off += 1;
            }
            if self.peek_char(off) == Some(':') {
                return Err(Error::scan(
                    self.position,
                    "Multi-line plain scalar may not be used as an implicit key".to_string(),
                ));
            }
        }

        // The limit is checked against the length before the final trim.
        self.resource_tracker
            .check_string_length(&self.limits, value.len())?;

        let value = value.trim_end().to_string();
        let normalized_value = Self::normalize_scalar(value);

        Ok(Token::new(
            TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
            start_pos,
            self.position,
        ))
    }
925
926    /// Scan a quoted string
927    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
928        let start_pos = self.position;
929        let mut value = String::new();
930
931        // Determine quote style based on quote character
932        let quote_style = match quote_char {
933            '\'' => tokens::QuoteStyle::Single,
934            '"' => tokens::QuoteStyle::Double,
935            _ => tokens::QuoteStyle::Plain,
936        };
937
938        self.advance(); // Skip opening quote
939        let mut closed = false;
940        let mut multi_line = false;
941        // High-water mark of bytes contributed by escape sequences. The
942        // trailing-whitespace strip at fold time must not pop past it,
943        // because an escape-produced \t / space is literal content
944        // (yaml-test-suite DE56/00, DE56/01).
945        let mut escape_end: usize = 0;
946
947        while let Some(ch) = self.current_char {
948            if ch == quote_char {
949                // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
950                // collapsing to a single `'`. Detect that here BEFORE
951                // treating the quote as the closing delimiter.
952                if quote_char == '\'' && self.peek_char(1) == Some('\'') {
953                    value.push('\'');
954                    self.advance();
955                    self.advance();
956                    continue;
957                }
958                self.advance(); // Skip closing quote
959                closed = true;
960                break;
961            } else if ch == '\\' && quote_char == '"' {
962                self.advance();
963                if let Some(escaped) = self.current_char {
964                    match escaped {
965                        // YAML 1.2 §5.7 double-quoted escape allowlist.
966                        'n' => value.push('\n'),
967                        't' => value.push('\t'),
968                        'r' => value.push('\r'),
969                        '\\' => value.push('\\'),
970                        '"' => value.push('"'),
971                        '0' => value.push('\0'),
972                        'a' => value.push('\x07'),
973                        'b' => value.push('\x08'),
974                        'f' => value.push('\x0C'),
975                        'v' => value.push('\x0B'),
976                        'e' => value.push('\x1B'),
977                        ' ' => value.push(' '),
978                        '/' => value.push('/'),
979                        'N' => value.push('\u{0085}'),
980                        '_' => value.push('\u{00A0}'),
981                        'L' => value.push('\u{2028}'),
982                        'P' => value.push('\u{2029}'),
983                        '\n' => {
984                            // Escaped line break (§7.3.2): the newline is
985                            // dropped AND leading whitespace on the next
986                            // line is excluded from the content.
987                            self.advance();
988                            while matches!(self.current_char, Some(' ' | '\t')) {
989                                self.advance();
990                            }
991                            continue;
992                        }
993                        '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
994                        // Hex / Unicode escapes per YAML 1.2 §5.7:
995                        //   \xNN     — 2 hex digits, codepoint  ≤ 0xFF
996                        //   \uNNNN   — 4 hex digits, codepoint  ≤ 0xFFFF
997                        //   \UNNNNNNNN — 8 hex digits, full Unicode codepoint
998                        'x' | 'u' | 'U' => {
999                            let n = match escaped {
1000                                'x' => 2,
1001                                'u' => 4,
1002                                _ => 8,
1003                            };
1004                            self.advance(); // consume the x/u/U
1005                            let mut codepoint: u32 = 0;
1006                            for _ in 0..n {
1007                                let c = self.current_char.ok_or_else(|| {
1008                                    Error::scan(
1009                                        self.position,
1010                                        format!("Truncated \\{escaped} escape"),
1011                                    )
1012                                })?;
1013                                let d = c.to_digit(16).ok_or_else(|| {
1014                                    Error::scan(
1015                                        self.position,
1016                                        format!("Invalid hex digit `{c}` in \\{escaped} escape"),
1017                                    )
1018                                })?;
1019                                codepoint = (codepoint << 4) | d;
1020                                self.advance();
1021                            }
1022                            let ch = char::from_u32(codepoint).ok_or_else(|| {
1023                                Error::scan(
1024                                    self.position,
1025                                    format!("Invalid Unicode codepoint U+{codepoint:X}"),
1026                                )
1027                            })?;
1028                            value.push(ch);
1029                            escape_end = value.len();
1030                            continue; // already advanced past hex digits
1031                        }
1032                        // Everything else is invalid per spec.
1033                        _ => {
1034                            return Err(Error::scan(
1035                                self.position,
1036                                format!("Invalid escape sequence: \\{escaped}"),
1037                            ));
1038                        }
1039                    }
1040                    escape_end = value.len();
1041                    self.advance();
1042                }
1043            } else if ch == '\\' {
1044                // Single-quoted strings have no backslash escapes — `\` is
1045                // a literal character. (Single-quote escape is `''`.)
1046                value.push(ch);
1047                self.advance();
1048            } else if ch == '\n' || ch == '\r' {
1049                // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
1050                // line folding: a single newline within a quoted scalar
1051                // folds to a space; N>1 consecutive newlines retain N-1;
1052                // leading whitespace on the continuation line is excluded.
1053                let mut newlines = 0usize;
1054                // §6.1: tabs cannot be indentation. A continuation
1055                // line that BEGINS with a tab (no leading spaces) in
1056                // an enclosing block context is invalid (yaml-test-
1057                // suite DK95/01). Tabs that appear AFTER spaces in
1058                // the same indent area are content, not indentation.
1059                let mut just_after_newline = false;
1060                while let Some(c) = self.current_char {
1061                    match c {
1062                        '\n' => {
1063                            newlines += 1;
1064                            multi_line = true;
1065                            self.advance();
1066                            just_after_newline = true;
1067                        }
1068                        '\r' => {
1069                            self.advance();
1070                        }
1071                        ' ' => {
1072                            self.advance();
1073                            just_after_newline = false;
1074                        }
1075                        '\t' if just_after_newline
1076                            && self.flow_level == 0
1077                            && (self.indent_stack.len() > 1
1078                                || !self.compact_sequence_indents.is_empty()) =>
1079                        {
1080                            return Err(Error::scan(
1081                                self.position,
1082                                "Tab cannot serve as indentation of quoted scalar continuation"
1083                                    .to_string(),
1084                            ));
1085                        }
1086                        '\t' => {
1087                            self.advance();
1088                        }
1089                        _ => break,
1090                    }
1091                }
1092                // §8.1.4: a multi-line quoted scalar inside a block
1093                // context must indent each continuation more than the
1094                // enclosing block. \`quoted: "a\\nb"\` with \`b\` at col 1
1095                // violates the rule because \`quoted:\` sits at indent 0
1096                // (yaml-test-suite QB6E). Only fires when there IS an
1097                // enclosing block (indent_stack > [0] or compact-seq
1098                // active) — top-level quoted scalars with continuation
1099                // at col 1 are legal.
1100                if newlines > 0
1101                    && self.flow_level == 0
1102                    && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
1103                    && !matches!(self.current_char, None | Some('\n' | '\r'))
1104                {
1105                    let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
1106                    let indent = self.position.column.saturating_sub(1);
1107                    if indent <= parent_indent {
1108                        return Err(Error::scan(
1109                            self.position,
1110                            "Quoted scalar continuation line is not indented enough".to_string(),
1111                        ));
1112                    }
1113                }
1114                // §6.8: a doc-start/end marker (`---` or `...`) at
1115                // column 1 always terminates the current document.
1116                // Encountering one inside an unterminated quoted
1117                // scalar is invalid — the quote escapes nothing past
1118                // the doc boundary (yaml-test-suite 5TRB, RXY3,
1119                // 9MQT/01).
1120                if self.position.column == 1 {
1121                    let next3: String = self
1122                        .char_cache
1123                        .get(self.current_char_index..self.current_char_index + 3)
1124                        .map(|s| s.iter().collect())
1125                        .unwrap_or_default();
1126                    if (next3 == "---" || next3 == "...")
1127                        && self
1128                            .char_cache
1129                            .get(self.current_char_index + 3)
1130                            .map_or(true, |c| c.is_whitespace())
1131                    {
1132                        return Err(Error::scan(
1133                            self.position,
1134                            format!(
1135                                "Document {} marker `{}` inside quoted scalar",
1136                                if next3 == "---" { "start" } else { "end" },
1137                                next3
1138                            ),
1139                        ));
1140                    }
1141                }
1142                // Drop trailing whitespace on the prior line (the bytes
1143                // we already pushed) before applying the fold. Don't
1144                // strip past `escape_end` — escape-produced whitespace
1145                // is literal content, not "trailing" line whitespace.
1146                while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
1147                    value.pop();
1148                }
1149                if newlines <= 1 {
1150                    value.push(' ');
1151                } else {
1152                    for _ in 0..(newlines - 1) {
1153                        value.push('\n');
1154                    }
1155                }
1156            } else {
1157                value.push(ch);
1158                self.advance();
1159
1160                // Check string length periodically to fail fast
1161                if value.len() > self.limits.max_string_length {
1162                    return Err(Error::limit_exceeded(format!(
1163                        "String length {} exceeds maximum {}",
1164                        value.len(),
1165                        self.limits.max_string_length
1166                    )));
1167                }
1168            }
1169        }
1170
1171        // Check string length limit
1172        if !closed {
1173            return Err(Error::scan(
1174                self.position,
1175                format!(
1176                    "Unclosed {} quoted string",
1177                    if quote_char == '"' {
1178                        "double"
1179                    } else {
1180                        "single"
1181                    }
1182                ),
1183            ));
1184        }
1185
1186        self.resource_tracker
1187            .check_string_length(&self.limits, value.len())?;
1188
1189        // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
1190        // the line (or sub-expression in flow context) must be empty save
1191        // for a separator. Skip horizontal whitespace and look at the next
1192        // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
1193        // newline/EOF, it's a trailing-content error (yaml-test-suite
1194        // Q4CL: `"quoted2" trailing content`).
1195        {
1196            let mut offset = 0isize;
1197            let mut saw_space = false;
1198            while matches!(self.peek_char(offset), Some(' ' | '\t')) {
1199                saw_space = true;
1200                offset += 1;
1201            }
1202            let next = self.peek_char(offset);
1203            // A `#` is a comment indicator ONLY when preceded by whitespace
1204            // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
1205            let ok = match next {
1206                None => true,
1207                Some('#') => saw_space,
1208                Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
1209            };
1210            if !ok {
1211                return Err(Error::scan(
1212                    self.position,
1213                    format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
1214                ));
1215            }
1216            // YAML 1.2 §8.1.3: implicit keys must be on a single line.
1217            // If the scalar folded across line breaks AND the next non-
1218            // whitespace char is `:` (key-value separator), the scalar
1219            // is being used as an implicit key — error.
1220            if multi_line && self.flow_level == 0 && next == Some(':') {
1221                return Err(Error::scan(
1222                    self.position,
1223                    "Multi-line quoted scalar may not be used as an implicit key".to_string(),
1224                ));
1225            }
1226        }
1227
1228        Ok(Token::new(
1229            TokenType::Scalar(value, quote_style),
1230            start_pos,
1231            self.position,
1232        ))
1233    }
1234
1235    /// Scan document start marker (---)
1236    fn scan_document_start(&mut self) -> Result<Option<Token>> {
1237        if self.current_char == Some('-')
1238            && self.peek_char(1) == Some('-')
1239            && self.peek_char(2) == Some('-')
1240            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1241        {
1242            // Doc markers are invalid inside flow collections.
1243            if self.flow_level > 0 {
1244                return Err(Error::scan(
1245                    self.position,
1246                    "`---` document-start marker is not allowed inside a flow collection"
1247                        .to_string(),
1248                ));
1249            }
1250            let start_pos = self.position;
1251            self.advance(); // -
1252            self.advance(); // -
1253            self.advance(); // -
1254
1255            Ok(Some(Token::new(
1256                TokenType::DocumentStart,
1257                start_pos,
1258                self.position,
1259            )))
1260        } else {
1261            Ok(None)
1262        }
1263    }
1264
1265    /// Scan YAML version directive (%YAML)
1266    fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1267        if self.current_char != Some('%') {
1268            return Ok(None);
1269        }
1270
1271        let start_pos = self.position;
1272        let saved_position = self.position;
1273        let saved_char = self.current_char;
1274        let saved_char_index = self.current_char_index;
1275        self.advance(); // Skip '%'
1276
1277        // Check for "YAML"
1278        if self.current_char == Some('Y')
1279            && self.peek_char(1) == Some('A')
1280            && self.peek_char(2) == Some('M')
1281            && self.peek_char(3) == Some('L')
1282            && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1283        {
1284            self.advance(); // Y
1285            self.advance(); // A
1286            self.advance(); // M
1287            self.advance(); // L
1288
1289            // Skip whitespace
1290            self.skip_whitespace();
1291
1292            // Parse version number (e.g., "1.2")
1293            let major = if let Some(ch) = self.current_char {
1294                if ch.is_ascii_digit() {
1295                    let digit = ch.to_digit(10).unwrap() as u8;
1296                    self.advance();
1297                    digit
1298                } else {
1299                    return Err(Error::scan(
1300                        self.position,
1301                        "Expected major version number after %YAML".to_string(),
1302                    ));
1303                }
1304            } else {
1305                return Err(Error::scan(
1306                    self.position,
1307                    "Expected version after %YAML directive".to_string(),
1308                ));
1309            };
1310
1311            // Expect '.'
1312            if self.current_char != Some('.') {
1313                return Err(Error::scan(
1314                    self.position,
1315                    "Expected '.' in YAML version".to_string(),
1316                ));
1317            }
1318            self.advance();
1319
1320            // Parse minor version
1321            let minor = if let Some(ch) = self.current_char {
1322                if ch.is_ascii_digit() {
1323                    let digit = ch.to_digit(10).unwrap() as u8;
1324                    self.advance();
1325                    digit
1326                } else {
1327                    return Err(Error::scan(
1328                        self.position,
1329                        "Expected minor version number after '.'".to_string(),
1330                    ));
1331                }
1332            } else {
1333                return Err(Error::scan(
1334                    self.position,
1335                    "Expected minor version number".to_string(),
1336                ));
1337            };
1338
1339            // YAML 1.2 §6.8.1: the directive line must end after the
1340            // version (modulo whitespace and an optional comment). Extra
1341            // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1342            // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1343            // whitespace before `#`.
1344            let mut saw_space = false;
1345            while matches!(self.current_char, Some(' ' | '\t')) {
1346                saw_space = true;
1347                self.advance();
1348            }
1349            match self.current_char {
1350                None | Some('\n' | '\r') => {}
1351                Some('#') if saw_space => {
1352                    while let Some(ch) = self.current_char {
1353                        if ch == '\n' || ch == '\r' {
1354                            break;
1355                        }
1356                        self.advance();
1357                    }
1358                }
1359                Some(c) => {
1360                    return Err(Error::scan(
1361                        self.position,
1362                        format!("Unexpected `{c}` after %YAML directive"),
1363                    ));
1364                }
1365            }
1366
1367            Ok(Some(Token::new(
1368                TokenType::YamlDirective(major, minor),
1369                start_pos,
1370                self.position,
1371            )))
1372        } else {
1373            // Not a YAML directive: restore the exact pre-`%` scanner state
1374            // in O(1). The previous code linear-scanned a char_indices side
1375            // table; saving the two cursor fields is both faster and lets
1376            // that table be dropped entirely (#26).
1377            self.position = saved_position;
1378            self.current_char = saved_char;
1379            self.current_char_index = saved_char_index;
1380            Ok(None)
1381        }
1382    }
1383
1384    /// Scan TAG directive (%TAG)
1385    fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1386        if self.current_char != Some('%') {
1387            return Ok(None);
1388        }
1389
1390        let start_pos = self.position;
1391        let saved_position = self.position;
1392        let saved_char = self.current_char;
1393        let saved_char_index = self.current_char_index;
1394        self.advance(); // Skip '%'
1395
1396        // Check for "TAG"
1397        if self.current_char == Some('T')
1398            && self.peek_char(1) == Some('A')
1399            && self.peek_char(2) == Some('G')
1400            && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1401        {
1402            self.advance(); // T
1403            self.advance(); // A
1404            self.advance(); // G
1405
1406            // Skip whitespace
1407            self.skip_whitespace();
1408
1409            // Parse handle (e.g., "!" or "!!")
1410            let handle = self.scan_tag_handle()?;
1411
1412            // Skip whitespace
1413            self.skip_whitespace();
1414
1415            // Parse prefix (URI)
1416            let prefix = self.scan_tag_prefix()?;
1417
1418            Ok(Some(Token::new(
1419                TokenType::TagDirective(handle, prefix),
1420                start_pos,
1421                self.position,
1422            )))
1423        } else {
1424            // Not a TAG directive: restore the exact pre-`%` scanner state
1425            // in O(1) (see the matching note in scan_yaml_directive, #26).
1426            self.position = saved_position;
1427            self.current_char = saved_char;
1428            self.current_char_index = saved_char_index;
1429            Ok(None)
1430        }
1431    }
1432
1433    /// Scan a tag handle for TAG directive
1434    fn scan_tag_handle(&mut self) -> Result<String> {
1435        let mut handle = String::new();
1436
1437        if self.current_char != Some('!') {
1438            return Err(Error::scan(
1439                self.position,
1440                "Expected '!' at start of tag handle".to_string(),
1441            ));
1442        }
1443
1444        handle.push('!');
1445        self.advance();
1446
1447        // Handle can be "!" or "!!" or "!name!"
1448        if self.current_char == Some('!') {
1449            // Secondary handle "!!"
1450            handle.push('!');
1451            self.advance();
1452        } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1453            // Named handle like "!name!"
1454            while let Some(ch) = self.current_char {
1455                if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1456                    handle.push(ch);
1457                    self.advance();
1458                } else if ch == '!' {
1459                    handle.push(ch);
1460                    self.advance();
1461                    break;
1462                } else {
1463                    break;
1464                }
1465            }
1466        }
1467        // else just "!" primary handle
1468
1469        Ok(handle)
1470    }
1471
1472    /// Scan a tag prefix (URI) for TAG directive
1473    fn scan_tag_prefix(&mut self) -> Result<String> {
1474        let mut prefix = String::new();
1475
1476        // Read until end of line or comment
1477        while let Some(ch) = self.current_char {
1478            if ch == '\n' || ch == '\r' || ch == '#' {
1479                break;
1480            }
1481            if ch.is_whitespace() && prefix.is_empty() {
1482                self.advance();
1483                continue;
1484            }
1485            if ch.is_whitespace() && !prefix.is_empty() {
1486                // Trailing whitespace, we're done
1487                break;
1488            }
1489            prefix.push(ch);
1490            self.advance();
1491        }
1492
1493        if prefix.is_empty() {
1494            return Err(Error::scan(
1495                self.position,
1496                "Expected tag prefix after tag handle".to_string(),
1497            ));
1498        }
1499
1500        Ok(prefix.trim().to_string())
1501    }
1502
1503    /// Check if current position might be a directive
1504    fn is_directive(&self) -> bool {
1505        self.current_char == Some('%') && self.position.column == 1
1506    }
1507
1508    /// Scan document end marker (...)
1509    fn scan_document_end(&mut self) -> Result<Option<Token>> {
1510        if self.current_char == Some('.')
1511            && self.peek_char(1) == Some('.')
1512            && self.peek_char(2) == Some('.')
1513            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1514        {
1515            // Doc markers are invalid inside flow collections.
1516            if self.flow_level > 0 {
1517                return Err(Error::scan(
1518                    self.position,
1519                    "`...` document-end marker is not allowed inside a flow collection".to_string(),
1520                ));
1521            }
1522            let start_pos = self.position;
1523            self.advance(); // .
1524            self.advance(); // .
1525            self.advance(); // .
1526
1527            // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1528            // end-of-line (comments allowed). Inline content after `...`
1529            // is invalid (yaml-test-suite 3HFZ).
1530            while let Some(ch) = self.current_char {
1531                match ch {
1532                    ' ' | '\t' => {
1533                        self.advance();
1534                    }
1535                    '\n' | '\r' | '#' => break,
1536                    _ => {
1537                        return Err(Error::scan(
1538                            self.position,
1539                            "Content after `...` document-end marker is invalid".to_string(),
1540                        ));
1541                    }
1542                }
1543            }
1544
1545            Ok(Some(Token::new(
1546                TokenType::DocumentEnd,
1547                start_pos,
1548                self.position,
1549            )))
1550        } else {
1551            Ok(None)
1552        }
1553    }
1554
1555    /// Scan a comment token
1556    fn scan_comment(&mut self) -> Result<Token> {
1557        let start_pos = self.position;
1558        let mut comment_text = String::new();
1559
1560        // Skip the '#' character
1561        if self.current_char == Some('#') {
1562            self.advance();
1563        }
1564
1565        // Collect the comment text
1566        while let Some(ch) = self.current_char {
1567            if ch == '\n' || ch == '\r' {
1568                break;
1569            }
1570            comment_text.push(ch);
1571            self.advance();
1572        }
1573
1574        // Trim leading whitespace from comment text
1575        let comment_text = comment_text.trim_start().to_string();
1576
1577        Ok(Token::new(
1578            TokenType::Comment(comment_text),
1579            start_pos,
1580            self.position,
1581        ))
1582    }
1583
1584    /// Process a line and generate appropriate tokens
1585    #[allow(clippy::cognitive_complexity)]
1586    fn process_line(&mut self) -> Result<()> {
1587        // Check for directives at start of line
1588        if self.position.column == 1 && self.current_char == Some('%') {
1589            // Try to scan YAML directive
1590            if let Some(token) = self.scan_yaml_directive()? {
1591                self.tokens.push(token);
1592                return Ok(());
1593            }
1594
1595            // Try to scan TAG directive
1596            if let Some(token) = self.scan_tag_directive()? {
1597                self.tokens.push(token);
1598                return Ok(());
1599            }
1600
1601            // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1602            // does not recognize. Skip the line silently — parsing continues
1603            // with whatever follows on the next line.
1604            if self.current_char == Some('%') {
1605                while let Some(ch) = self.current_char {
1606                    if ch == '\n' || ch == '\r' {
1607                        break;
1608                    }
1609                    self.advance();
1610                }
1611                return Ok(());
1612            }
1613        }
1614
1615        // Check for document markers at start of line
1616        if self.position.column == 1 {
1617            // Check for document start marker
1618            if let Some(token) = self.scan_document_start()? {
1619                self.tokens.push(token);
1620                return Ok(());
1621            }
1622
1623            // Check for document end marker
1624            if let Some(token) = self.scan_document_end()? {
1625                self.tokens.push(token);
1626                return Ok(());
1627            }
1628        }
1629
1630        // Handle indentation at start of line
1631        if self.position.column == 1 {
1632            self.handle_indentation()?;
1633        }
1634
1635        // Skip empty lines and comments
1636        self.skip_whitespace();
1637
1638        match self.current_char {
1639            None => return Ok(()),
1640            Some('#') => {
1641                if self.preserve_comments {
1642                    // Create a comment token
1643                    let comment_token = self.scan_comment()?;
1644                    self.tokens.push(comment_token);
1645                } else {
1646                    // Skip comment lines
1647                    while let Some(ch) = self.current_char {
1648                        if ch == '\n' || ch == '\r' {
1649                            break;
1650                        }
1651                        self.advance();
1652                    }
1653                }
1654                return Ok(());
1655            }
1656            Some('\n' | '\r') => {
1657                self.advance();
1658                return Ok(());
1659            }
1660            _ => {}
1661        }
1662
1663        // Process tokens on this line
1664        while let Some(ch) = self.current_char {
1665            match ch {
1666                '\n' | '\r' => break,
1667                ' ' | '\t' => {
1668                    self.skip_whitespace();
1669                }
1670                '#' => {
1671                    // YAML 1.2 §6.6: a comment must be preceded by whitespace
1672                    // OR be at the start of a line. Inputs like `,#invalid`
1673                    // (yaml-test-suite CVW2) are not valid comments.
1674                    let prev = self.peek_char(-1);
1675                    let at_line_start = self.position.column == 1;
1676                    let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1677                    if !at_line_start && !preceded_by_space {
1678                        return Err(Error::scan(
1679                            self.position,
1680                            "Comment `#` must be preceded by whitespace".to_string(),
1681                        ));
1682                    }
1683                    if self.preserve_comments {
1684                        let comment_token = self.scan_comment()?;
1685                        self.tokens.push(comment_token);
1686                    } else {
1687                        while let Some(ch) = self.current_char {
1688                            if ch == '\n' || ch == '\r' {
1689                                break;
1690                            }
1691                            self.advance();
1692                        }
1693                    }
1694                    break;
1695                }
1696
1697                // Flow indicators. §7.4 allows a flow collection as
1698                // the implicit key of a block mapping (`[a]: b`,
1699                // `{x: y}: z`). When the flow-open is at line-start
1700                // (block context) and a `:` follows on the same line,
1701                // open the wrapping block mapping at the column of the
1702                // flow-open token, just as we do for line-start
1703                // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1704                '[' => {
1705                    if self.flow_level == 0
1706                        && self.position.column == self.current_indent + 1
1707                        && self.check_for_mapping_ahead()
1708                    {
1709                        self.maybe_open_block_mapping_for_key()?;
1710                    }
1711                    let pos = self.position;
1712                    self.advance();
1713                    self.flow_level += 1;
1714                    // Check depth limit
1715                    self.resource_tracker
1716                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1717                    self.tokens
1718                        .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1719                }
1720                ']' => {
1721                    // YAML 1.2 §7.4: `]` is only valid inside an open
1722                    // flow sequence. Stray `]` is a syntax error
1723                    // (yaml-test-suite 4H7K).
1724                    if self.flow_level == 0 {
1725                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1726                            .with_suggestion(
1727                                "Remove the extra `]` or open a flow sequence with `[` first"
1728                                    .to_string(),
1729                            );
1730                        return Err(Error::scan_with_context(
1731                            self.position,
1732                            "Unexpected `]` outside flow context",
1733                            context,
1734                        ));
1735                    }
1736                    let pos = self.position;
1737                    self.advance();
1738                    self.flow_level -= 1;
1739                    self.tokens
1740                        .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1741                }
1742                '{' => {
1743                    if self.flow_level == 0
1744                        && self.position.column == self.current_indent + 1
1745                        && self.check_for_mapping_ahead()
1746                    {
1747                        self.maybe_open_block_mapping_for_key()?;
1748                    }
1749                    let pos = self.position;
1750                    self.advance();
1751                    self.flow_level += 1;
1752                    // Check depth limit
1753                    self.resource_tracker
1754                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1755                    self.tokens
1756                        .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1757                }
1758                '}' => {
1759                    if self.flow_level == 0 {
1760                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1761                            .with_suggestion(
1762                                "Remove the extra `}` or open a flow mapping with `{` first"
1763                                    .to_string(),
1764                            );
1765                        return Err(Error::scan_with_context(
1766                            self.position,
1767                            "Unexpected `}` outside flow context",
1768                            context,
1769                        ));
1770                    }
1771                    let pos = self.position;
1772                    self.advance();
1773                    self.flow_level -= 1;
1774                    self.tokens
1775                        .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1776                }
1777                ',' => {
1778                    // §7.4: \`,\` is a flow indicator. Outside flow
1779                    // context it's not meaningful as a structural
1780                    // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1781                    // — the comma after a tag in block context is
1782                    // invalid).
1783                    if self.flow_level == 0 {
1784                        return Err(Error::scan(
1785                            self.position,
1786                            "Unexpected `,` outside flow context".to_string(),
1787                        ));
1788                    }
1789                    let pos = self.position;
1790                    self.advance();
1791                    self.tokens
1792                        .push(Token::new(TokenType::FlowEntry, pos, self.position));
1793                }
1794
1795                // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1796                //   * Block context: `:` separates key from value only when
1797                //     followed by whitespace / EOF — otherwise it's part of
1798                //     a plain scalar (e.g. `:foo`, `URL://path`).
1799                //   * Flow context: same, plus `:` may be adjacent to a
1800                //     value when the previous token completed a key node
1801                //     (quoted/plain scalar, alias, or closed flow
1802                //     collection) — see yaml-test-suite 5MUD, 5T43.
1803                ':' if self.peek_char(1).map_or(true, |c| {
1804                    c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1805                }) || (self.flow_level > 0
1806                    && matches!(
1807                        self.tokens.last().map(|t| &t.token_type),
1808                        Some(
1809                            TokenType::Scalar(_, _)
1810                                | TokenType::Alias(_)
1811                                | TokenType::FlowMappingEnd
1812                                | TokenType::FlowSequenceEnd
1813                        )
1814                    )) =>
1815                {
1816                    // §6.2: a \`:\` at line-start (the explicit-value
1817                    // counterpart of an explicit \`?\` key) must be
1818                    // followed by a SPACE — a tab as separator is
1819                    // invalid (yaml-test-suite Y79Y/007, /009).
1820                    if self.flow_level == 0
1821                        && self.position.column == self.current_indent + 1
1822                        && self.peek_char(1) == Some('\t')
1823                    {
1824                        return Err(Error::scan(
1825                            self.position,
1826                            "Tab cannot follow line-start `:` as explicit-value separator"
1827                                .to_string(),
1828                        ));
1829                    }
1830                    // §8.22: an implicit key in block context must fit
1831                    // on a single line. If the previous token is a
1832                    // flow-collection close whose matching open is on
1833                    // a different line, the flow node spans multiple
1834                    // lines and can't serve as the key (yaml-test-
1835                    // suite C2SP \`[23\\n]: 42\`).
1836                    if self.flow_level == 0 {
1837                        let mut is_flow_close = false;
1838                        let mut close_end_line = 0;
1839                        if let Some(last) = self.tokens.last() {
1840                            if matches!(
1841                                last.token_type,
1842                                TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1843                            ) {
1844                                is_flow_close = true;
1845                                close_end_line = last.end_position.line;
1846                            }
1847                        }
1848                        if is_flow_close {
1849                            let mut depth = 0i32;
1850                            let mut open_idx: Option<usize> = None;
1851                            for (idx, t) in self.tokens.iter().enumerate().rev() {
1852                                match &t.token_type {
1853                                    TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1854                                        depth += 1;
1855                                    }
1856                                    TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1857                                        depth -= 1;
1858                                        if depth == 0 {
1859                                            open_idx = Some(idx);
1860                                            break;
1861                                        }
1862                                    }
1863                                    _ => {}
1864                                }
1865                            }
1866                            if let Some(oi) = open_idx {
1867                                let open_line = self.tokens[oi].start_position.line;
1868                                // If a `?` (Key) token precedes the
1869                                // matching flow open on the same line
1870                                // as the key, the key is explicit and
1871                                // may span lines (yaml-test-suite M5DY
1872                                // \`? [ ...spans... ]: [ ... ]\`).
1873                                let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1874                                    matches!(t.token_type, TokenType::Key)
1875                                        && t.start_position.line == open_line
1876                                });
1877                                if !key_marker_before && open_line != close_end_line {
1878                                    return Err(Error::scan(
1879                                        self.position,
1880                                        "Implicit key in block context: flow collection key spans multiple lines"
1881                                            .to_string(),
1882                                    ));
1883                                }
1884                            }
1885                        }
1886                    }
1887                    let pos = self.position;
1888                    self.advance();
1889                    self.tokens
1890                        .push(Token::new(TokenType::Value, pos, self.position));
1891                }
1892
1893                // §6.2: the explicit-key marker \`?\` must be followed
1894                // by a SPACE (or EOL), not a tab. Tab as separator
1895                // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1896                '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1897                    return Err(Error::scan(
1898                        self.position,
1899                        "Tab cannot follow `?` as block-key separator".to_string(),
1900                    ));
1901                }
1902
1903                // Explicit key marker. An indented `?` at line-start
1904                // (e.g. `mapping:\\n  ? key`) opens an implicit block
1905                // mapping at this column — same as a line-start scalar
1906                // key. Without this, scan_plain_scalar wouldn't see
1907                // the inner mapping's indent and would wrongly fold
1908                // the key content into a multi-line scalar
1909                // (yaml-test-suite S9E8, KK5P).
1910                '?' if self.flow_level == 0
1911                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1912                        || self.peek_char(1).is_none()) =>
1913                {
1914                    if self.position.column == self.current_indent + 1 {
1915                        self.maybe_open_block_mapping_for_key()?;
1916                    }
1917                    let pos = self.position;
1918                    self.advance();
1919                    self.tokens
1920                        .push(Token::new(TokenType::Key, pos, self.position));
1921                }
1922                '?' if self.flow_level > 0
1923                    && (self
1924                        .peek_char(1)
1925                        .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1926                        || self.peek_char(1).is_none()) =>
1927                {
1928                    let pos = self.position;
1929                    self.advance();
1930                    self.tokens
1931                        .push(Token::new(TokenType::Key, pos, self.position));
1932                }
1933
1934                // Block entry
1935                '-' if self.flow_level == 0
1936                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1937                        || self.peek_char(1).is_none()) =>
1938                {
1939                    // A block-entry \`-\` immediately after a flow
1940                    // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1941                    // is invalid — no separator between the closed
1942                    // flow node and the next sibling (yaml-test-suite
1943                    // P2EQ \`- { y: z }- invalid\`). The same-line guard
1944                    // is essential — a \`}\` on a previous line with a
1945                    // new \`-\` on the next line is perfectly valid.
1946                    //
1947                    // Likewise, a block-entry \`-\` immediately after a
1948                    // property (Anchor / Tag) on the same line is
1949                    // invalid — the property must precede a node, and
1950                    // a block sequence's first \`-\` must begin a line
1951                    // (yaml-test-suite SY6V \`&anchor - x\`).
1952                    if let Some(last) = self.tokens.last() {
1953                        if matches!(
1954                            last.token_type,
1955                            TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1956                        ) && last.end_position.line == self.position.line
1957                        {
1958                            return Err(Error::scan(
1959                                self.position,
1960                                "Block-entry `-` immediately after flow collection close"
1961                                    .to_string(),
1962                            ));
1963                        }
1964                        if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1965                            && last.end_position.line == self.position.line
1966                        {
1967                            return Err(Error::scan(
1968                                self.position,
1969                                "Block-entry `-` cannot follow a property on the same line"
1970                                    .to_string(),
1971                            ));
1972                        }
1973                        // §8.22: a block sequence's first \`-\` must
1974                        // begin on a new line. \`key: - a\` (implicit
1975                        // key, then dash on same line) is invalid
1976                        // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1977                        // (explicit value-separator on the same line
1978                        // as the dash) IS valid: the \`?\` key sits
1979                        // on a previous line. We distinguish by
1980                        // walking back from the Value: if the
1981                        // preceding non-property token is a Scalar
1982                        // on the same line as the Value, the key
1983                        // is implicit; otherwise it's after \`?\`.
1984                        if matches!(last.token_type, TokenType::Value)
1985                            && last.end_position.line == self.position.line
1986                        {
1987                            let value_line = last.start_position.line;
1988                            let mut prior_scalar_line = None;
1989                            for t in self.tokens.iter().rev().skip(1) {
1990                                match &t.token_type {
1991                                    TokenType::Anchor(_) | TokenType::Tag(_) => {}
1992                                    TokenType::Scalar(..) => {
1993                                        prior_scalar_line = Some(t.end_position.line);
1994                                        break;
1995                                    }
1996                                    _ => break,
1997                                }
1998                            }
1999                            if prior_scalar_line == Some(value_line) {
2000                                return Err(Error::scan(
2001                                    self.position,
2002                                    "Block sequence value cannot start on the same line as its key"
2003                                        .to_string(),
2004                                ));
2005                            }
2006                        }
2007                    }
2008                    let pos = self.position;
2009                    self.advance();
2010
2011                    // Check if we need to start a new block sequence.
2012                    // `unwrap_or(0)` mirrors the pattern in
2013                    // src/scanner/indentation.rs and is safer than
2014                    // `.unwrap()` here: an error-recovery pop in another
2015                    // path could otherwise leave the stack empty and
2016                    // panic on crafted input (#18).
2017                    let last_indent = self.indent_stack.last().copied().unwrap_or(0);
2018
2019                    // If a compact sequence (opened from `? - x` or
2020                    // similar) is already active at this dash's column,
2021                    // the dash continues it — don't open a new nested
2022                    // block sequence (yaml-test-suite M5DY).
2023                    let dash_indent = pos.column.saturating_sub(1);
2024                    let compact_active_here = self
2025                        .compact_sequence_indents
2026                        .last()
2027                        .map_or(false, |&si| si == dash_indent);
2028                    if compact_active_here {
2029                        // Continuation of an existing compact sequence.
2030                    } else if self.current_indent > last_indent {
2031                        // Deeper indentation - start new nested sequence
2032                        self.indent_stack.push(self.current_indent);
2033                        self.indent_is_sequence.push(true);
2034                        // Check depth limit
2035                        self.resource_tracker
2036                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2037                        self.tokens
2038                            .push(Token::simple(TokenType::BlockSequenceStart, pos));
2039                    } else if self.current_indent == last_indent
2040                        && *self.indent_is_sequence.last().unwrap_or(&false)
2041                    {
2042                        // Same indent and the top of stack is already a sequence
2043                        // → continuation of that sequence; no new start needed.
2044                    } else if self.current_indent >= last_indent {
2045                        // Same or root level — compact notation.
2046                        // Start a new sequence only if we don't already have one
2047                        // tracked at this exact indent.
2048                        // For a dash that's *not* at line-start (e.g.
2049                        // `? - x` where current_indent is still the
2050                        // line's indent but the dash sits in mid-line),
2051                        // use the dash column - 1 as the sequence's
2052                        // indent so scan_plain_scalar's continuation
2053                        // check correctly sees the deeper context
2054                        // (yaml-test-suite M5DY).
2055                        let dash_indent = pos.column.saturating_sub(1);
2056                        let seq_indent = dash_indent.max(self.current_indent);
2057                        let has_active_compact = self
2058                            .compact_sequence_indents
2059                            .last()
2060                            .map_or(false, |&si| si == seq_indent);
2061
2062                        if !has_active_compact {
2063                            self.compact_sequence_indents.push(seq_indent);
2064                            // Check depth limit
2065                            self.resource_tracker.check_depth(
2066                                &self.limits,
2067                                self.flow_level + self.indent_stack.len(),
2068                            )?;
2069                            self.tokens
2070                                .push(Token::simple(TokenType::BlockSequenceStart, pos));
2071                        }
2072                    }
2073
2074                    self.tokens
2075                        .push(Token::new(TokenType::BlockEntry, pos, self.position));
2076
2077                    // After emitting BlockEntry, check if the next
2078                    // token is another dash (nested sequence). §6.2
2079                    // requires SPACE separation between dashes — a
2080                    // tab between the outer and inner \`-\` is invalid
2081                    // (yaml-test-suite Y79Y/004, /005). Track whether
2082                    // a tab was consumed while skipping the inter-
2083                    // dash whitespace and reject if so.
2084                    let mut saw_tab_between = false;
2085                    while let Some(c) = self.current_char {
2086                        if c == ' ' {
2087                            self.advance();
2088                        } else if c == '\t' {
2089                            saw_tab_between = true;
2090                            self.advance();
2091                        } else {
2092                            break;
2093                        }
2094                    }
2095                    if self.current_char == Some('-')
2096                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2097                        && saw_tab_between
2098                    {
2099                        return Err(Error::scan(
2100                            self.position,
2101                            "Tab between block-entries on same line".to_string(),
2102                        ));
2103                    }
2104                    if self.current_char == Some('-')
2105                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2106                    {
2107                        // We have a nested sequence on the same line!
2108                        // Track this as an inline sequence
2109                        self.inline_sequence_depth += 1;
2110                        // Push the *indent* (column - 1), not the
2111                        // column, so it matches the convention used by
2112                        // maybe_open_block_mapping_for_key. With column
2113                        // here the next-line indent (column - 1) would
2114                        // be strictly less than the stored value and
2115                        // wrongly trigger an early close, breaking
2116                        // multi-line nested sequences (yaml-test-suite
2117                        // 3ALJ, 57H4).
2118                        self.indent_stack
2119                            .push(self.position.column.saturating_sub(1));
2120                        self.indent_is_sequence.push(true);
2121                        // Check depth limit
2122                        self.resource_tracker
2123                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2124                        self.tokens
2125                            .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2126                        // Continue processing - the next iteration will handle the nested dash
2127                    } else if self.current_char.is_some()
2128                        && !matches!(self.current_char, Some('\n' | '\r'))
2129                    {
2130                        // Content follows "- " on the same line.
2131                        // Update current_indent to the content's column position so that
2132                        // any mapping started here will be at a deeper indent level than
2133                        // the sequence. This ensures handle_indentation properly closes
2134                        // the mapping when the next sibling "- " appears.
2135                        self.current_indent = self.position.column - 1;
2136                    }
2137                }
2138
2139                // Quoted strings — same implicit-key mapping detection
2140                // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2141                '"' | '\'' => {
2142                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2143                        self.maybe_open_block_mapping_for_key()?;
2144                    }
2145                    let token = self.scan_quoted_string(ch)?;
2146                    self.tokens.push(token);
2147                }
2148
2149                // Document markers (only if not a block entry).
2150                //
2151                // Reached only when `-` is at column = current_indent + 1 AND
2152                // the next character is non-whitespace — i.e. either the
2153                // `---` document-start marker OR a plain scalar starting
2154                // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2155                // declines, we MUST consume the run as a plain scalar — not
2156                // consulting `is_plain_scalar_start` here, because that helper
2157                // unconditionally rejects `-`, which would leave the outer
2158                // `while let` loop spinning on the same character.
2159                '-' if self.position.column == self.current_indent + 1
2160                    && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2161                {
2162                    if let Some(token) = self.scan_document_start()? {
2163                        self.tokens.push(token);
2164                    } else {
2165                        let token = self.scan_plain_scalar()?;
2166                        self.tokens.push(token);
2167                    }
2168                }
2169                '.' if self.position.column == self.current_indent + 1 => {
2170                    if let Some(token) = self.scan_document_end()? {
2171                        self.tokens.push(token);
2172                    } else if self.is_plain_scalar_start() {
2173                        let token = self.scan_plain_scalar()?;
2174                        self.tokens.push(token);
2175                    }
2176                }
2177
2178                // Numbers or plain scalars starting with -
2179                // Only scan as number if the entire token is numeric (no trailing letters)
2180                _ if (ch.is_ascii_digit()
2181                    || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2182                    && self.is_pure_number() =>
2183                {
2184                    // A numeric scalar can be an implicit mapping key just
2185                    // like any other scalar. Open the block mapping before
2186                    // the key token so `BlockMappingStart` is emitted —
2187                    // every other scalar dispatch arm does this; the number
2188                    // arm previously skipped it, so `421: null` parsed as a
2189                    // bare scalar instead of a mapping (#66).
2190                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2191                        self.maybe_open_block_mapping_for_key()?;
2192                    }
2193                    let token = self.scan_number()?;
2194                    self.tokens.push(token);
2195                }
2196
2197                // Anchors and aliases. §6.9: a node's properties
2198                // (anchor/tag) are prefixes of the node. When an `&`,
2199                // `*`, or `!` is at the start of a line (column ==
2200                // current_indent + 1) and a `: ` follows on the same
2201                // line, the property/alias is part of an implicit
2202                // key's leading position. The block mapping that
2203                // contains this key therefore opens at this column,
2204                // *before* the property/alias token is emitted
2205                // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2206                '&' => {
2207                    // Mirror H7J7 check for anchors (yaml-test-suite
2208                    // G9HC \`seq:\\n&anchor\\n- a\`).
2209                    if self.flow_level == 0
2210                        && self.position.column == self.current_indent + 1
2211                        && !self.check_for_mapping_ahead()
2212                        && self.indent_stack.len() > 1
2213                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2214                        && self.most_recent_token_is_value_separator()
2215                    {
2216                        return Err(Error::scan(
2217                            self.position,
2218                            "Anchor at line-start with insufficient indent for value position"
2219                                .to_string(),
2220                        ));
2221                    }
2222                    if self.flow_level == 0
2223                        && self.position.column == self.current_indent + 1
2224                        && self.check_for_mapping_ahead()
2225                    {
2226                        self.maybe_open_block_mapping_for_key()?;
2227                    }
2228                    let token = self.scan_anchor()?;
2229                    self.tokens.push(token);
2230                }
2231                '*' => {
2232                    // §6.9.2: alias/anchor names may contain \`:\` (only
2233                    // flow indicators and whitespace terminate them).
2234                    // So \`*a:\` is an alias named \`a:\`, NOT an alias
2235                    // \`*a\` followed by a key separator. Don't open
2236                    // an implicit block mapping in that case (yaml-
2237                    // test-suite 2SXE).
2238                    if self.flow_level == 0
2239                        && self.position.column == self.current_indent + 1
2240                        && self.check_for_mapping_ahead()
2241                        && !self.colon_belongs_to_alias_anchor_name()
2242                    {
2243                        self.maybe_open_block_mapping_for_key()?;
2244                    }
2245                    let token = self.scan_alias()?;
2246                    self.tokens.push(token);
2247                }
2248
2249                // Block scalars
2250                '|' => {
2251                    let token = self.scan_literal_block_scalar()?;
2252                    self.tokens.push(token);
2253                    // Block scalar collection rewinds the cursor to the
2254                    // start of the next under-indented line. `current_indent`
2255                    // is still set to the inline content's column from the
2256                    // enclosing `- |` / `key: |` site, so the next iteration
2257                    // would mis-dispatch. Break out so the outer loop
2258                    // re-enters `process_line` and reruns indent handling
2259                    // (yaml-test-suite 4QFQ, M6YH, P2AD).
2260                    break;
2261                }
2262                '>' => {
2263                    let token = self.scan_folded_block_scalar()?;
2264                    self.tokens.push(token);
2265                    break;
2266                }
2267
2268                // Tags. Same line-start property-opens-mapping rule
2269                // (yaml-test-suite ZH7C variants).
2270                //
2271                // §6.9: a property at the SAME indent as the
2272                // enclosing mapping/sequence cannot apply to that
2273                // collection's value — the value must be more
2274                // indented. If we're at a line-start \`!\` whose column
2275                // equals the enclosing mapping's indent + 1 AND that
2276                // mapping currently has a key awaiting a value, the
2277                // tag is misplaced (yaml-test-suite H7J7).
2278                '!' => {
2279                    if self.flow_level == 0
2280                        && self.position.column == self.current_indent + 1
2281                        && !self.check_for_mapping_ahead()
2282                        && self.indent_stack.len() > 1
2283                        && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2284                        && self.most_recent_token_is_value_separator()
2285                    {
2286                        return Err(Error::scan(
2287                            self.position,
2288                            "Tag at line-start with insufficient indent for value position"
2289                                .to_string(),
2290                        ));
2291                    }
2292                    if self.flow_level == 0
2293                        && self.position.column == self.current_indent + 1
2294                        && self.check_for_mapping_ahead()
2295                    {
2296                        self.maybe_open_block_mapping_for_key()?;
2297                    }
2298                    let token = self.scan_tag()?;
2299                    self.tokens.push(token);
2300                }
2301
2302                // Plain scalars
2303                _ if self.is_plain_scalar_start() => {
2304                    // A plain scalar starting on the SAME line as a
2305                    // flow-collection close (\`}\` or \`]\`) means there's
2306                    // no separator between the closed flow node and
2307                    // the new content (yaml-test-suite 62EZ
2308                    // \`x: { y: z }in: valid\`).
2309                    if self.flow_level == 0 {
2310                        if let Some(last) = self.tokens.last() {
2311                            if matches!(
2312                                last.token_type,
2313                                TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2314                            ) && last.end_position.line == self.position.line
2315                            {
2316                                return Err(Error::scan(
2317                                    self.position,
2318                                    "Plain scalar immediately after flow collection close"
2319                                        .to_string(),
2320                                ));
2321                            }
2322                        }
2323                    }
2324                    if self.flow_level == 0 && self.check_for_mapping_ahead() {
2325                        self.maybe_open_block_mapping_for_key()?;
2326                    }
2327
2328                    let token = self.scan_plain_scalar()?;
2329                    self.tokens.push(token);
2330                }
2331
2332                _ => {
2333                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2334                        .with_suggestion("Check for valid YAML syntax characters".to_string());
2335                    return Err(Error::invalid_character_with_context(
2336                        self.position,
2337                        ch,
2338                        "YAML document",
2339                        context,
2340                    ));
2341                }
2342            }
2343        }
2344
2345        // Inline sequences (nested \`- -\` on one line) used to be
2346        // closed unconditionally at end-of-line. But a nested sequence
2347        // can span lines (`- - a\n  - b\n- c`) — in that case the inner
2348        // sequence must remain open until handle_indentation sees a
2349        // dedent. Reset the inline-sequence counter (so the next line
2350        // is judged on its own merits) but DO NOT emit BlockEnd —
2351        // handle_indentation's indent_stack pop, the end-of-stream
2352        // close at scan_next_token, and the explicit-dedent close at
2353        // handle_indentation's bottom each provide a correct close.
2354        self.inline_sequence_depth = 0;
2355
2356        Ok(())
2357    }
2358
2359    /// Scan the next token lazily
2360    fn scan_next_token(&mut self) -> Result<()> {
2361        if self.done {
2362            return Ok(());
2363        }
2364
2365        // Add stream start token if this is the beginning
2366        if self.tokens.is_empty() {
2367            self.tokens
2368                .push(Token::simple(TokenType::StreamStart, self.position));
2369            return Ok(());
2370        }
2371
2372        // Check if we're at the end of input
2373        if self.current_char.is_none() {
2374            if !self
2375                .tokens
2376                .iter()
2377                .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2378            {
2379                self.tokens
2380                    .push(Token::simple(TokenType::StreamEnd, self.position));
2381            }
2382            self.done = true;
2383            return Ok(());
2384        }
2385
2386        // For now, fall back to scanning all tokens at once for the lazy scanner
2387        // This is a simplified implementation - a full streaming parser would
2388        // need more sophisticated state management
2389        let tokens_before = self.tokens.len();
2390        self.scan_all_tokens()?;
2391
2392        // Mark as done after scanning all tokens
2393        if self.tokens.len() == tokens_before {
2394            self.done = true;
2395        }
2396
2397        Ok(())
2398    }
2399
2400    /// Pre-scan all tokens (simplified approach for basic implementation)
2401    fn scan_all_tokens(&mut self) -> Result<()> {
2402        // Only add StreamStart if we don't have it yet
2403        if !self
2404            .tokens
2405            .iter()
2406            .any(|t| matches!(t.token_type, TokenType::StreamStart))
2407        {
2408            self.tokens
2409                .push(Token::simple(TokenType::StreamStart, self.position));
2410        }
2411
2412        while self.current_char.is_some() {
2413            self.process_line()?;
2414
2415            // Advance past newlines
2416            while let Some(ch) = self.current_char {
2417                if ch == '\n' || ch == '\r' {
2418                    self.advance();
2419                } else {
2420                    break;
2421                }
2422            }
2423        }
2424
2425        // Close any remaining compact sequences (before their parent mappings)
2426        while self.compact_sequence_indents.pop().is_some() {
2427            self.tokens
2428                .push(Token::simple(TokenType::BlockEnd, self.position));
2429        }
2430
2431        // Close any remaining blocks
2432        while self.indent_stack.len() > 1 {
2433            self.indent_stack.pop();
2434            self.indent_is_sequence.pop();
2435            self.tokens
2436                .push(Token::simple(TokenType::BlockEnd, self.position));
2437        }
2438
2439        self.tokens
2440            .push(Token::simple(TokenType::StreamEnd, self.position));
2441        self.done = true;
2442        Ok(())
2443    }
2444
2445    /// Peek at a character at the given offset (can be negative)
2446    /// Check if the current position starts a pure number (digits/dots/minus only,
2447    /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2448    fn is_pure_number(&self) -> bool {
2449        let mut offset: isize = 0;
2450        let first = self.peek_char(0);
2451        // Skip leading minus
2452        if first == Some('-') {
2453            offset = 1;
2454        }
2455        // Scan digits and at most one dot
2456        let mut has_digit = false;
2457        let mut dot_count = 0;
2458        loop {
2459            match self.peek_char(offset) {
2460                Some(c) if c.is_ascii_digit() => {
2461                    has_digit = true;
2462                    offset += 1;
2463                }
2464                Some('.') => {
2465                    dot_count += 1;
2466                    if dot_count > 1 {
2467                        // Multiple dots (e.g. 0.5.8) — not a number
2468                        return false;
2469                    }
2470                    offset += 1;
2471                }
2472                Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2473                    // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2474                    return false;
2475                }
2476                Some(c) => {
2477                    // For a token to be a pure number, what follows
2478                    // the digits must be end-of-token. In flow
2479                    // context that's a flow indicator. In block
2480                    // context the rest of the line must be pure
2481                    // whitespace (possibly trailing a comment) — if
2482                    // there's more non-whitespace content on this
2483                    // line, the digits are part of a larger plain
2484                    // scalar like \`1 - 3\` (yaml-test-suite P76L)
2485                    // or \`20:03:20\` (yaml-test-suite U9NS).
2486                    if self.flow_level > 0 && ",[]{}".contains(c) {
2487                        return has_digit;
2488                    }
2489                    if c == '\n' || c == '\r' {
2490                        return has_digit;
2491                    }
2492                    if c == ' ' || c == '\t' {
2493                        // Look ahead: rest of line must be whitespace
2494                        // or a comment.
2495                        let mut probe = offset + 1;
2496                        loop {
2497                            match self.peek_char(probe) {
2498                                None => return has_digit,
2499                                Some('\n' | '\r') => return has_digit,
2500                                Some('#') => return has_digit,
2501                                Some(' ' | '\t') => probe += 1,
2502                                Some(_) => return false,
2503                            }
2504                        }
2505                    }
2506                    if c == ':' {
2507                        let next = self.peek_char(offset + 1);
2508                        return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2509                    }
2510                    return false;
2511                }
2512                None => return has_digit,
2513            }
2514        }
2515    }
2516
2517    fn peek_char(&self, offset: isize) -> Option<char> {
2518        // `unsigned_abs()` yields the magnitude as `usize` and is total — it
2519        // is defined even for `isize::MIN`, where `-offset` overflows (panic
2520        // in debug, wrapping UB in release). `checked_add`/`checked_sub` then
2521        // make an out-of-range index a `None` rather than a panic (#20).
2522        let magnitude = offset.unsigned_abs();
2523        let target_index = if offset >= 0 {
2524            self.current_char_index.checked_add(magnitude)?
2525        } else {
2526            self.current_char_index.checked_sub(magnitude)?
2527        };
2528        self.char_cache.get(target_index).copied()
2529    }
2530
2531    /// Scan an anchor token (&name)
2532    fn scan_anchor(&mut self) -> Result<Token> {
2533        let start_pos = self.position;
2534        self.advance(); // Skip '&'
2535
2536        let name = self.scan_identifier()?;
2537        if name.is_empty() {
2538            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2539                "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2540            );
2541            return Err(Error::scan_with_context(
2542                self.position,
2543                "Anchor name cannot be empty",
2544                context,
2545            ));
2546        }
2547
2548        // Track anchor for resource limits
2549        self.resource_tracker.add_anchor(&self.limits)?;
2550
2551        Ok(Token::new(
2552            TokenType::Anchor(name),
2553            start_pos,
2554            self.position,
2555        ))
2556    }
2557
2558    /// Scan an alias token (*name)
2559    fn scan_alias(&mut self) -> Result<Token> {
2560        let start_pos = self.position;
2561        self.advance(); // Skip '*'
2562
2563        let name = self.scan_identifier()?;
2564        if name.is_empty() {
2565            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2566                "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2567            );
2568            return Err(Error::scan_with_context(
2569                self.position,
2570                "Alias name cannot be empty",
2571                context,
2572            ));
2573        }
2574
2575        Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2576    }
2577
2578    /// Scan an identifier (used for anchor and alias names)
2579    fn scan_identifier(&mut self) -> Result<String> {
2580        // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2581        // exclusions are whitespace and the flow indicators `,[]{}`. This
2582        // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2583        // codepoints (including emoji), matching the spec exactly.
2584        let mut identifier = String::new();
2585        while let Some(ch) = self.current_char {
2586            if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2587                break;
2588            }
2589            identifier.push(ch);
2590            // Cap heap growth before an attacker-controlled anchor/alias name
2591            // can exhaust memory: bail the moment it exceeds max_string_length,
2592            // rather than after the full String is materialized (#24).
2593            self.resource_tracker
2594                .check_string_length(&self.limits, identifier.len())?;
2595            self.advance();
2596        }
2597        Ok(identifier)
2598    }
2599
2600    /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2601    fn scan_tag(&mut self) -> Result<Token> {
2602        let start_pos = self.position;
2603        self.advance(); // Skip first '!'
2604
2605        let mut tag = String::from("!");
2606
2607        // Check for verbatim tag format: !<tag>
2608        if self.current_char == Some('<') {
2609            tag.push('<');
2610            self.advance(); // Skip '<'
2611
2612            // Scan until closing '>'
2613            while let Some(ch) = self.current_char {
2614                if ch == '>' {
2615                    tag.push(ch);
2616                    self.advance();
2617                    break;
2618                } else if ch.is_control() || ch.is_whitespace() {
2619                    return Err(Error::scan(
2620                        self.position,
2621                        "Invalid character in verbatim tag".to_string(),
2622                    ));
2623                }
2624                tag.push(ch);
2625                self.advance();
2626            }
2627        } else {
2628            // Check for secondary tag handle: !!
2629            if self.current_char == Some('!') {
2630                tag.push('!');
2631                self.advance(); // Skip second '!'
2632            }
2633
2634            // Scan tag name/suffix.
2635            //
2636            // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2637            // contain any URI character (RFC 3986 unreserved + sub-delims +
2638            // a few others) or `%XX` percent-encoded bytes. The handful of
2639            // characters listed below covers the alphanumeric + URI-safe
2640            // punctuation set used by yaml-test-suite. Percent decoding of
2641            // `%XX` happens later in `TagResolver::resolve`.
2642            //
2643            // §5.3: inside a flow collection, the flow indicators
2644            // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2645            // must NOT consume them into the tag suffix even though
2646            // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2647            // YAML 1.2 in practice treats `,` as a flow indicator that
2648            // must be percent-encoded (\`%2C\`) when it appears inside
2649            // a tag suffix — bare \`,\` is not allowed in EITHER block
2650            // or flow context (yaml-test-suite U99R).
2651            while let Some(ch) = self.current_char {
2652                if matches!(ch, ',') {
2653                    break;
2654                }
2655                if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2656                    break;
2657                }
2658                // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2659                // `tag:yaml.org,2002:str` legitimately contains two
2660                // colons inside its URI. But a `:` followed by
2661                // whitespace, EOL or EOF is the YAML mapping-value
2662                // indicator and MUST terminate the tag, otherwise
2663                // `!handle!suffix: value` is mis-scanned as
2664                // `Tag("!handle!suffix:") Scalar("value")` and the
2665                // implicit-key mapping structure is lost. Mirrors the
2666                // `,` carve-out above (a valid URI char that's also a
2667                // YAML flow indicator in some contexts).
2668                if ch == ':' {
2669                    match self.peek_char(1) {
2670                        None => break,
2671                        Some(c) if c.is_whitespace() => break,
2672                        _ => {}
2673                    }
2674                }
2675                if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2676                    tag.push(ch);
2677                    self.advance();
2678                } else {
2679                    break;
2680                }
2681            }
2682        }
2683
2684        Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2685    }
2686
2687    /// Scan a literal block scalar (|)
2688    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2689        let start_pos = self.position;
2690        self.advance(); // Skip '|'
2691
2692        // Parse block scalar header (indicators like +, -, explicit indent)
2693        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2694
2695        // Skip to next line
2696        self.skip_to_next_line()?;
2697
2698        // Determine indentation. `base_indent` is the surrounding
2699        // block's indent — i.e. the indent of the sequence or
2700        // mapping that contains this scalar. `self.current_indent`
2701        // is sometimes set to the inline indicator column (e.g. 2
2702        // for `- |`), which would make `base_indent + explicit`
2703        // wrong; use the top of `indent_stack` instead
2704        // (yaml-test-suite 4QFQ `|1`).
2705        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2706        let content_indent = if let Some(explicit) = explicit_indent {
2707            base_indent + explicit
2708        } else {
2709            // Find the first non-empty content line to determine indentation
2710            self.find_block_scalar_indent(base_indent)?
2711        };
2712
2713        // Collect the literal block content
2714        let content = self.collect_literal_block_content(content_indent, chomping)?;
2715
2716        Ok(Token::new(
2717            TokenType::BlockScalarLiteral(content),
2718            start_pos,
2719            self.position,
2720        ))
2721    }
2722
2723    /// Scan a folded block scalar (>)
2724    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2725        let start_pos = self.position;
2726        self.advance(); // Skip '>'
2727
2728        // Parse block scalar header (indicators like +, -, explicit indent)
2729        let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2730
2731        // Skip to next line
2732        self.skip_to_next_line()?;
2733
2734        // See scan_literal_block_scalar for why we read `indent_stack`
2735        // rather than `current_indent`.
2736        let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2737        let content_indent = if let Some(explicit) = explicit_indent {
2738            base_indent + explicit
2739        } else {
2740            // Find the first non-empty content line to determine indentation
2741            self.find_block_scalar_indent(base_indent)?
2742        };
2743
2744        // Collect the folded block content
2745        let content = self.collect_folded_block_content(content_indent, chomping)?;
2746
2747        Ok(Token::new(
2748            TokenType::BlockScalarFolded(content),
2749            start_pos,
2750            self.position,
2751        ))
2752    }
2753
2754    /// Parse block scalar header indicators (+, -, and explicit indent)
2755    fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2756        let mut chomping = ChompingMode::Clip;
2757        let mut explicit_indent: Option<usize> = None;
2758        // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2759        // \`>#x\` are invalid (yaml-test-suite X4QW).
2760        let mut seen_separator_ws = false;
2761
2762        // Parse indicators in any order
2763        while let Some(ch) = self.current_char {
2764            match ch {
2765                '+' => {
2766                    chomping = ChompingMode::Keep;
2767                    self.advance();
2768                }
2769                '-' => {
2770                    chomping = ChompingMode::Strip;
2771                    self.advance();
2772                }
2773                '0'..='9' => {
2774                    let digit = ch.to_digit(10).unwrap() as usize;
2775                    if explicit_indent.is_some() {
2776                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2777                            .with_suggestion(
2778                                "Use only one indent indicator digit in block scalar".to_string(),
2779                            );
2780                        return Err(Error::scan_with_context(
2781                            self.position,
2782                            "Multiple indent indicators in block scalar",
2783                            context,
2784                        ));
2785                    }
2786                    // YAML 1.2 §8.1.1.1: explicit indent indicator is
2787                    // 1..=9. `|0` and `>0` are invalid
2788                    // (yaml-test-suite 2G84/00).
2789                    if digit == 0 {
2790                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
2791                            .with_suggestion(
2792                                "Block-scalar indent indicator must be 1-9".to_string(),
2793                            );
2794                        return Err(Error::scan_with_context(
2795                            self.position,
2796                            "Block-scalar indent indicator `0` is invalid",
2797                            context,
2798                        ));
2799                    }
2800                    explicit_indent = Some(digit);
2801                    self.advance();
2802                }
2803                ' ' | '\t' => {
2804                    seen_separator_ws = true;
2805                    self.advance(); // Skip whitespace
2806                }
2807                '#' => {
2808                    if !seen_separator_ws {
2809                        return Err(Error::scan(
2810                            self.position,
2811                            "Comment in block-scalar header must be preceded by whitespace"
2812                                .to_string(),
2813                        ));
2814                    }
2815                    // Skip comment to end of line
2816                    while let Some(ch) = self.current_char {
2817                        self.advance();
2818                        if ch == '\n' || ch == '\r' {
2819                            break;
2820                        }
2821                    }
2822                    break;
2823                }
2824                '\n' | '\r' => break,
2825                _ => {
2826                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
2827                        .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2828                    return Err(Error::invalid_character_with_context(
2829                        self.position,
2830                        ch,
2831                        "block scalar header",
2832                        context,
2833                    ));
2834                }
2835            }
2836        }
2837
2838        Ok((chomping, explicit_indent))
2839    }
2840
2841    /// Advance the cursor PAST the next line break, but do not consume
2842    /// any leading whitespace on the line that follows. The block-
2843    /// scalar header parser uses this to step from the indicator line
2844    /// to the start of the content line — the next line's leading
2845    /// spaces are part of its content_indent, not header whitespace.
2846    fn skip_to_next_line(&mut self) -> Result<()> {
2847        // If we're already at column 1 (the comment handler in
2848        // scan_block_scalar_header may have already advanced past a
2849        // newline), do nothing — the next line's leading whitespace
2850        // belongs to its content_indent.
2851        if self.position.column == 1 {
2852            return Ok(());
2853        }
2854        while let Some(ch) = self.current_char {
2855            match ch {
2856                '\n' | '\r' => {
2857                    self.advance();
2858                    return Ok(());
2859                }
2860                ' ' | '\t' => {
2861                    self.advance();
2862                }
2863                _ => return Ok(()),
2864            }
2865        }
2866        Ok(())
2867    }
2868
2869    /// Find the content indentation for a block scalar.
2870    ///
2871    /// Per spec §8.1.1.1, indent is the leading-space count of the first
2872    /// non-empty content line (or the longest blank-line indent if no
2873    /// non-empty line exists). A non-empty line whose indent is not
2874    /// strictly deeper than `base_indent` is outside the scalar's
2875    /// scope — that line is a sibling structure, not content
2876    /// (yaml-test-suite K858).
2877    fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
2878        let saved_position = self.position;
2879        let saved_char = self.current_char;
2880        let saved_char_index = self.current_char_index;
2881
2882        let mut max_blank_indent: usize = 0;
2883        let mut found = false;
2884        let mut content_indent: usize = 1;
2885
2886        loop {
2887            let mut line_indent = 0;
2888            while self.current_char == Some(' ') {
2889                line_indent += 1;
2890                self.advance();
2891            }
2892            // §6.1 + §8.1: tabs cannot serve as block-scalar
2893            // indentation. A line that BEGINS with a tab (no leading
2894            // spaces) inside the block scalar's indent search is
2895            // invalid (yaml-test-suite Y79Y/000 \`foo: |\\n\\tbar\`).
2896            // Tabs that appear AFTER one or more spaces are content,
2897            // not indentation, and remain valid (yaml-test-suite
2898            // 96NN/00 \`foo: |-\\n \\tbar\`).
2899            if line_indent == 0 && self.current_char == Some('\t') {
2900                return Err(Error::scan(
2901                    self.position,
2902                    "Tab cannot serve as block-scalar indentation".to_string(),
2903                ));
2904            }
2905
2906            match self.current_char {
2907                None => {
2908                    if line_indent > max_blank_indent {
2909                        max_blank_indent = line_indent;
2910                    }
2911                    break;
2912                }
2913                Some('\n' | '\r') => {
2914                    if line_indent > max_blank_indent {
2915                        max_blank_indent = line_indent;
2916                    }
2917                    self.advance();
2918                    // fall through to next iteration
2919                }
2920                Some(_) => {
2921                    // If we're nested inside another block — either
2922                    // via the `indent_stack` (normal mapping/sequence
2923                    // open) or `compact_sequence_indents` (a
2924                    // compact block sequence at the same indent as
2925                    // its parent) — and this candidate line is not
2926                    // strictly deeper than base_indent, it's a
2927                    // sibling outside the scalar's scope (yaml-test-
2928                    // suite K858, P2AD).
2929                    let inside_block =
2930                        self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
2931                    if inside_block && line_indent <= base_indent {
2932                        content_indent = max_blank_indent.max(base_indent + 1);
2933                    } else {
2934                        content_indent = line_indent;
2935                    }
2936                    // §8.1.2.1: leading blank lines may not exceed the
2937                    // detected content indent — that ambiguity is
2938                    // invalid (yaml-test-suite W9L4, S98Z).
2939                    if max_blank_indent > content_indent {
2940                        self.position = saved_position;
2941                        self.current_char = saved_char;
2942                        self.current_char_index = saved_char_index;
2943                        return Err(Error::scan(
2944                            self.position,
2945                            "Block scalar leading blank-line indent exceeds content indent"
2946                                .to_string(),
2947                        ));
2948                    }
2949                    found = true;
2950                    break;
2951                }
2952            }
2953        }
2954
2955        if !found {
2956            content_indent = max_blank_indent;
2957        }
2958
2959        self.position = saved_position;
2960        self.current_char = saved_char;
2961        self.current_char_index = saved_char_index;
2962
2963        Ok(content_indent)
2964    }
2965
2966    /// Count indentation at start of current line
2967    fn count_line_indent(&mut self) -> usize {
2968        let mut indent = 0;
2969        let saved_position = self.position;
2970        let saved_char = self.current_char;
2971        let saved_char_index = self.current_char_index;
2972
2973        while let Some(ch) = self.current_char {
2974            if ch == ' ' {
2975                indent += 1;
2976                self.advance();
2977            } else if ch == '\t' {
2978                indent += 8; // Tab counts as 8 spaces
2979                self.advance();
2980            } else {
2981                break;
2982            }
2983        }
2984
2985        // Restore position
2986        self.position = saved_position;
2987        self.current_char = saved_char;
2988        self.current_char_index = saved_char_index;
2989
2990        indent
2991    }
2992
2993    /// Collect content for a literal block scalar.
2994    ///
2995    /// Each line is preserved with its terminating newline. After collection
2996    /// we apply the chomping mode per spec §8.1.1.2.
2997    fn collect_literal_block_content(
2998        &mut self,
2999        content_indent: usize,
3000        chomping: ChompingMode,
3001    ) -> Result<String> {
3002        let mut content = String::new();
3003
3004        loop {
3005            // Count current line's leading-space indent.
3006            let mut line_indent = 0;
3007            let save_pos = self.position;
3008            let save_ch = self.current_char;
3009            let save_idx = self.current_char_index;
3010            while self.current_char == Some(' ') {
3011                line_indent += 1;
3012                self.advance();
3013            }
3014
3015            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3016
3017            if !line_is_blank && line_indent < content_indent {
3018                // Non-empty line with less indent ends the scalar; rewind.
3019                self.position = save_pos;
3020                self.current_char = save_ch;
3021                self.current_char_index = save_idx;
3022                break;
3023            }
3024
3025            // Document marker at line start always ends the scalar,
3026            // regardless of content_indent (allows zero-indented
3027            // block scalars per yaml-test-suite FP8R).
3028            if line_indent == 0 && self.is_doc_marker_here() {
3029                self.position = save_pos;
3030                self.current_char = save_ch;
3031                self.current_char_index = save_idx;
3032                break;
3033            }
3034
3035            if line_is_blank {
3036                // A blank line counts when there's an actual line break
3037                // to consume. EOF after we've consumed some whitespace
3038                // on the trailing line ALSO counts as one final blank
3039                // line (yaml-test-suite JEF9/02: `- |+\n        `).
3040                if matches!(self.current_char, Some('\n' | '\r')) {
3041                    // Whitespace beyond content_indent is literal content
3042                    // even on blank lines (yaml-test-suite 6FWR).
3043                    for _ in content_indent..line_indent {
3044                        content.push(' ');
3045                    }
3046                    content.push('\n');
3047                    self.advance();
3048                    continue;
3049                }
3050                if line_indent > 0 {
3051                    for _ in content_indent..line_indent {
3052                        content.push(' ');
3053                    }
3054                    content.push('\n');
3055                }
3056                break;
3057            }
3058
3059            // Content line: we already consumed `line_indent` spaces, but
3060            // only `content_indent` of them belong to indentation. Any
3061            // extra leading spaces are literal content.
3062            let mut line = String::new();
3063            for _ in content_indent..line_indent {
3064                line.push(' ');
3065            }
3066            while let Some(ch) = self.current_char {
3067                if ch == '\n' || ch == '\r' {
3068                    self.advance();
3069                    break;
3070                }
3071                line.push(ch);
3072                self.advance();
3073            }
3074            content.push_str(&line);
3075            content.push('\n');
3076
3077            if self.current_char.is_none() {
3078                break;
3079            }
3080        }
3081
3082        Ok(apply_chomping(content, chomping))
3083    }
3084
3085    /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3086    fn is_doc_marker_here(&self) -> bool {
3087        let c0 = self.current_char;
3088        let c1 = self.peek_char(1);
3089        let c2 = self.peek_char(2);
3090        let c3 = self.peek_char(3);
3091        let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3092        (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3093            || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3094    }
3095
3096    /// Collect content for a folded block scalar.
3097    ///
3098    /// Folding rules (§8.1.3): a sequence of single blank lines between
3099    /// equally-indented non-empty content lines collapses into a single
3100    /// space; runs of blank lines emit `n-1` newlines; more-indented
3101    /// lines preserve their newline boundaries. After collection, apply
3102    /// chomping (§8.1.1.2).
3103    fn collect_folded_block_content(
3104        &mut self,
3105        content_indent: usize,
3106        chomping: ChompingMode,
3107    ) -> Result<String> {
3108        #[derive(Clone, Copy, PartialEq, Eq)]
3109        enum LineKind {
3110            Normal,
3111            MoreIndented,
3112            Empty,
3113        }
3114        struct Line {
3115            text: String,
3116            kind: LineKind,
3117        }
3118
3119        let mut lines: Vec<Line> = Vec::new();
3120
3121        loop {
3122            let mut line_indent = 0;
3123            let save_pos = self.position;
3124            let save_ch = self.current_char;
3125            let save_idx = self.current_char_index;
3126            while self.current_char == Some(' ') {
3127                line_indent += 1;
3128                self.advance();
3129            }
3130
3131            let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3132
3133            if !line_is_blank && line_indent < content_indent {
3134                self.position = save_pos;
3135                self.current_char = save_ch;
3136                self.current_char_index = save_idx;
3137                break;
3138            }
3139
3140            if line_indent == 0 && self.is_doc_marker_here() {
3141                self.position = save_pos;
3142                self.current_char = save_ch;
3143                self.current_char_index = save_idx;
3144                break;
3145            }
3146
3147            if line_is_blank {
3148                if matches!(self.current_char, Some('\n' | '\r')) {
3149                    lines.push(Line {
3150                        text: String::new(),
3151                        kind: LineKind::Empty,
3152                    });
3153                    self.advance();
3154                    continue;
3155                }
3156                break;
3157            }
3158
3159            // Capture extra-indent leading spaces as part of content.
3160            let mut text = String::new();
3161            for _ in content_indent..line_indent {
3162                text.push(' ');
3163            }
3164            while let Some(ch) = self.current_char {
3165                if ch == '\n' || ch == '\r' {
3166                    self.advance();
3167                    break;
3168                }
3169                text.push(ch);
3170                self.advance();
3171            }
3172            // §8.1.3.2: "more indented" means the content (after the
3173            // common indent strip) begins with extra whitespace —
3174            // either spaces or tabs (yaml-test-suite MJS9).
3175            let kind = if text.starts_with(' ') || text.starts_with('\t') {
3176                LineKind::MoreIndented
3177            } else {
3178                LineKind::Normal
3179            };
3180            lines.push(Line { text, kind });
3181
3182            if self.current_char.is_none() {
3183                break;
3184            }
3185        }
3186
3187        // Build the folded output.
3188        let mut content = String::new();
3189        let mut idx = 0;
3190        while idx < lines.len() {
3191            let line = &lines[idx];
3192            match line.kind {
3193                LineKind::Normal | LineKind::MoreIndented => {
3194                    content.push_str(&line.text);
3195                    // Lookahead: count immediately-following empty lines.
3196                    let mut j = idx + 1;
3197                    let mut empties = 0;
3198                    while j < lines.len() && lines[j].kind == LineKind::Empty {
3199                        empties += 1;
3200                        j += 1;
3201                    }
3202                    if j < lines.len() {
3203                        // Spec §8.1.3.2: folding behaviour depends on
3204                        // whether either surrounding content line is
3205                        // "more indented" than the content indent.
3206                        // - both Normal, 0 empties → fold to space.
3207                        // - both Normal, k empties → k newlines (one
3208                        //   break folded out).
3209                        // - any MoreIndented, 0 empties → 1 newline.
3210                        // - any MoreIndented, k empties → k+1 newlines
3211                        //   (every break preserved).
3212                        let mi_adjacent = line.kind == LineKind::MoreIndented
3213                            || lines[j].kind == LineKind::MoreIndented;
3214                        if empties == 0 {
3215                            if mi_adjacent {
3216                                content.push('\n');
3217                            } else {
3218                                content.push(' ');
3219                            }
3220                        } else {
3221                            let breaks = if mi_adjacent { empties + 1 } else { empties };
3222                            for _ in 0..breaks {
3223                                content.push('\n');
3224                            }
3225                        }
3226                        idx = j;
3227                    } else {
3228                        // End of stream after content (possibly trailing empties).
3229                        // Always emit final `\n` for the last content line; extra
3230                        // trailing empties contribute additional `\n`s, and chomping
3231                        // will trim them later if needed.
3232                        content.push('\n');
3233                        for _ in 0..empties {
3234                            content.push('\n');
3235                        }
3236                        break;
3237                    }
3238                }
3239                LineKind::Empty => {
3240                    // Leading empty lines (no preceding content): emit as `\n`s.
3241                    content.push('\n');
3242                    idx += 1;
3243                }
3244            }
3245        }
3246
3247        Ok(apply_chomping(content, chomping))
3248    }
3249
3250    /// Emit a `BlockMappingStart` token if the current position is the
3251    /// start of an implicit key and no mapping is yet active at this
3252    /// indent level. Shared by plain and quoted scalar dispatch.
3253    fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3254        // Use `unwrap_or(0)` for parity with the indentation module's
3255        // helpers — defends against error-recovery pop paths that could
3256        // leave the stack momentarily empty (#18).
3257        let last_indent = self.indent_stack.last().copied().unwrap_or(0);
3258        let should_start_new_mapping = if self.current_indent > last_indent {
3259            true
3260        } else if self.current_indent == last_indent {
3261            !self.check_active_mapping_at_level(self.current_indent)
3262        } else {
3263            false
3264        };
3265        if should_start_new_mapping {
3266            // §6.1 + §8.22: opening a NEW block mapping at deeper
3267            // indent than the parent only makes sense if the parent
3268            // has a key WITHOUT a value (the new mapping IS that
3269            // value). If the parent's last content is a complete
3270            // (key, value) pair — i.e. the most recent meaningful
3271            // token is a value-position scalar/alias/close — then
3272            // there's no node to host the deeper mapping (yaml-test-
3273            // suite U44R: \`map:\\n  key1: q\\n   key2: bad\` — key2
3274            // is deeper than key1 but key1's value is already \`q\`).
3275            if self.current_indent > last_indent && last_indent > 0 {
3276                let mut depth = 0i32;
3277                let mut last_meaningful = None;
3278                for t in self.tokens.iter().rev() {
3279                    match &t.token_type {
3280                        TokenType::BlockEnd => depth += 1,
3281                        TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3282                            if depth == 0 {
3283                                break;
3284                            }
3285                            depth -= 1;
3286                        }
3287                        TokenType::Anchor(_) | TokenType::Tag(_) => {}
3288                        other => {
3289                            if depth == 0 {
3290                                last_meaningful = Some(other.clone());
3291                                break;
3292                            }
3293                        }
3294                    }
3295                }
3296                if matches!(
3297                    last_meaningful,
3298                    Some(
3299                        TokenType::Scalar(..)
3300                            | TokenType::Alias(_)
3301                            | TokenType::FlowSequenceEnd
3302                            | TokenType::FlowMappingEnd
3303                            | TokenType::BlockScalarLiteral(..)
3304                            | TokenType::BlockScalarFolded(..)
3305                    )
3306                ) {
3307                    return Err(Error::scan(
3308                        self.position,
3309                        "Indentation increase has no parent in current mapping/sequence"
3310                            .to_string(),
3311                    ));
3312                }
3313            }
3314            self.indent_stack.push(self.current_indent);
3315            self.indent_is_sequence.push(false);
3316            self.resource_tracker
3317                .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3318            self.tokens
3319                .push(Token::simple(TokenType::BlockMappingStart, self.position));
3320        }
3321        Ok(())
3322    }
3323
3324    /// Look ahead on the current line for a `:` that marks a mapping key.
3325    ///
3326    /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3327    /// followed by whitespace. Only `: ` terminates the scalar. If the
3328    /// line begins with `"` or `'`, the leading quoted scalar's contents
3329    /// are scanned past (including `''` and `\"` escapes) before looking
3330    /// for the `: ` that would make this scalar a key. This handles
3331    /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3332    /// For an alias/anchor at the current position, scan past
3333    /// the `&`/`*` and the name characters; if the FIRST char that
3334    /// would terminate the name is `:`, the colon is PART of the
3335    /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3336    /// that case so the caller can skip the implicit-key fast-path.
3337    fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3338        // Start after the `&` / `*` introducer.
3339        let mut i = self.current_char_index + 1;
3340        let n = self.char_cache.len();
3341        // Per scan_identifier rules: stop at whitespace or flow indicator.
3342        while i < n {
3343            let c = self.char_cache[i];
3344            if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3345                break;
3346            }
3347            i += 1;
3348        }
3349        // If the next char (or last consumed?) at termination is `:`,
3350        // then the name ended with `:`. Look at the LAST consumed
3351        // char. Actually our scan_identifier accepts `:` as part of
3352        // name — so the colon is already in the name. There's no
3353        // separate "value indicator" colon after.
3354        //
3355        // For the implicit-key fast path to be wrong, we need the
3356        // name to END with `:` (last char of name is `:`).
3357        if i > self.current_char_index + 1 {
3358            let last_name_char = self.char_cache[i - 1];
3359            if last_name_char == ':' {
3360                return true;
3361            }
3362        }
3363        false
3364    }
3365
3366    /// Scan ahead on the current line (the rest of the post-indent
3367    /// content) to determine whether it looks like an implicit
3368    /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3369    /// before any newline.
3370    fn line_after_indent_is_implicit_key(&self) -> bool {
3371        let mut i = self.current_char_index;
3372        let n = self.char_cache.len();
3373        while i < n {
3374            let ch = self.char_cache[i];
3375            if ch == '\n' || ch == '\r' {
3376                return false;
3377            }
3378            if ch == ':' {
3379                let next = self.char_cache.get(i + 1).copied();
3380                if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3381                    return true;
3382                }
3383            }
3384            i += 1;
3385        }
3386        false
3387    }
3388
3389    /// Walk back through recent tokens; if the last non-property
3390    /// token was `Value` (`:`), the parser is in value-expectation
3391    /// mode (key not yet matched with a value).
3392    fn most_recent_token_is_value_separator(&self) -> bool {
3393        for t in self.tokens.iter().rev() {
3394            match t.token_type {
3395                TokenType::Anchor(_) | TokenType::Tag(_) => {}
3396                TokenType::Value => return true,
3397                _ => return false,
3398            }
3399        }
3400        false
3401    }
3402
3403    fn check_for_mapping_ahead(&self) -> bool {
3404        let mut i = self.current_char_index;
3405        let n = self.char_cache.len();
3406        if i < n {
3407            let first = self.char_cache[i];
3408            if first == '\'' || first == '"' {
3409                let quote = first;
3410                i += 1;
3411                while i < n {
3412                    let c = self.char_cache[i];
3413                    if c == '\n' || c == '\r' {
3414                        return false; // unterminated quote on line
3415                    }
3416                    if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3417                        // `''` is the in-string single-quote escape.
3418                        i += 2;
3419                        continue;
3420                    }
3421                    if quote == '"' && c == '\\' {
3422                        // Skip the escaped char.
3423                        i += 2;
3424                        continue;
3425                    }
3426                    if c == quote {
3427                        i += 1;
3428                        break;
3429                    }
3430                    i += 1;
3431                }
3432            }
3433        }
3434        // Skip balanced flow collections — a `:` *inside* `[...]` or
3435        // `{...}` does NOT make the line a block-mapping key (the flow
3436        // collection itself can BE the key, but its inner colons are
3437        // part of its own structure). yaml-test-suite: `{key: v}` is
3438        // a standalone flow mapping; `[a]: outer` is a block-map key.
3439        let mut flow_depth: i32 = 0;
3440        while i < n {
3441            let ch = self.char_cache[i];
3442            match ch {
3443                '\n' | '\r' => return false,
3444                '[' | '{' => flow_depth += 1,
3445                ']' | '}' => flow_depth -= 1,
3446                ':' if flow_depth <= 0 => {
3447                    let next = self.char_cache.get(i + 1).copied();
3448                    match next {
3449                        None => return true,
3450                        Some(c) if c.is_whitespace() => return true,
3451                        _ => {}
3452                    }
3453                }
3454                _ => {}
3455            }
3456            i += 1;
3457        }
3458        false
3459    }
3460
3461    /// Check if there's an active mapping at the specified indentation level
3462    /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3463    fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3464        let mut depth = 0;
3465
3466        // Walk backwards through tokens to find the innermost unmatched block start.
3467        // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3468        // decrement it (both open blocks that need a matching BlockEnd).
3469        // When depth == 0 we have found the block start that is still "open".
3470        for token in self.tokens.iter().rev() {
3471            match &token.token_type {
3472                TokenType::BlockMappingStart => {
3473                    if depth == 0 {
3474                        // The innermost open block is a mapping — active at this level.
3475                        return true;
3476                    }
3477                    depth -= 1;
3478                }
3479                TokenType::BlockSequenceStart => {
3480                    if depth == 0 {
3481                        // The innermost open block is a sequence, not a mapping.
3482                        return false;
3483                    }
3484                    depth -= 1;
3485                }
3486                TokenType::BlockEnd => {
3487                    depth += 1;
3488                }
3489                TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3490                    // Stop at document boundaries
3491                    break;
3492                }
3493                _ => {}
3494            }
3495        }
3496
3497        false
3498    }
3499}
3500
3501impl Scanner for BasicScanner {
3502    fn check_token(&self) -> bool {
3503        // For lazy scanning: check if we have cached tokens or can generate more
3504        self.token_index < self.tokens.len() || !self.done
3505    }
3506
3507    fn peek_token(&self) -> Result<Option<&Token>> {
3508        // This is a bit tricky with lazy scanning since peek shouldn't mutate
3509        // For now, return cached token if available
3510        Ok(self.tokens.get(self.token_index))
3511    }
3512
3513    fn get_token(&mut self) -> Result<Option<Token>> {
3514        // If we need more tokens and haven't finished, scan next token
3515        if self.token_index >= self.tokens.len() && !self.done {
3516            self.scan_next_token()?;
3517        }
3518
3519        if self.token_index < self.tokens.len() {
3520            let token = self.tokens[self.token_index].clone();
3521            self.token_index += 1;
3522            Ok(Some(token))
3523        } else {
3524            Ok(None)
3525        }
3526    }
3527
3528    fn reset(&mut self) {
3529        self.token_index = 0;
3530        self.position = Position::start();
3531        self.tokens.clear();
3532        self.done = false;
3533        self.current_char = self.input.chars().next();
3534        self.indent_stack = vec![0];
3535        self.current_indent = 0;
3536        self.flow_level = 0;
3537        self.detected_indent_style = None;
3538        self.indent_samples.clear();
3539        self.previous_indent_level = 0;
3540        self.current_char_index = 0;
3541        self.current_char = self.char_cache.first().copied();
3542    }
3543
3544    fn position(&self) -> Position {
3545        self.position
3546    }
3547
3548    fn input(&self) -> &str {
3549        &self.input
3550    }
3551}
3552
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression for #20. peek_char's negative branch must not compute
    /// `-offset` on `isize::MIN` — that negation overflows (panic in debug
    /// builds; in release builds it silently wraps back to `isize::MIN`,
    /// which is defined behaviour but yields a nonsensical offset). An
    /// out-of-range backward offset yields `None`.
    #[test]
    fn peek_char_handles_isize_min_without_overflow() {
        let scanner = BasicScanner::new("abc".to_string());
        assert_eq!(scanner.peek_char(isize::MIN), None);
    }

    /// Regression for #20. The public `ScalarScanner::peek_char` takes a
    /// `usize`; the `BasicScanner` bridge casts it to `isize`. A `usize`
    /// above `isize::MAX` wraps to `isize::MIN` — it must still yield `None`,
    /// never a panic.
    #[test]
    fn scalar_scanner_peek_char_survives_huge_usize_offset() {
        let scanner = BasicScanner::new("abc".to_string());
        let huge = (isize::MAX as usize) + 1; // casts to isize::MIN
        assert_eq!(ScalarScanner::peek_char(&scanner, huge), None);
    }

    /// Regression for #19. Reaching this constructor with malformed input
    /// must record the scanning error so callers can detect failure via
    /// `has_scanning_error()`. Previously the result of `scan_all_tokens`
    /// was dropped, silently truncating the token stream.
    #[test]
    fn new_eager_with_comments_propagates_scanning_errors() {
        // A doc-start marker inside an unterminated quoted scalar is a
        // scanning error (see `Error::scan(... "inside quoted scalar")`).
        // First confirm the non-comment constructor reports it — that
        // anchors the parity check.
        let input = "\"abc\n---\n";
        let plain = BasicScanner::new_eager(input.to_string());
        assert!(
            plain.has_scanning_error(),
            "precondition: malformed input must produce a scanning error via new_eager"
        );

        let with_comments = BasicScanner::new_eager_with_comments(input.to_string());
        assert!(
            with_comments.has_scanning_error(),
            "new_eager_with_comments must NOT silently swallow scanner errors"
        );
    }

    /// Drive the parser pipeline on `input` in a dedicated thread, returning
    /// `None` if it doesn't finish within two seconds. Used by regression
    /// tests for parser hangs so a still-broken parser doesn't block the
    /// whole `cargo test` run.
    ///
    /// Any scanning error is deliberately discarded and a parse error simply
    /// ends event collection: callers only care about termination and the
    /// events produced up to that point. On timeout the worker thread is
    /// left detached (it cannot be cancelled).
    fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
        use crate::parser::{BasicParser, Parser as ParserTrait};
        use std::sync::mpsc;
        use std::thread;
        use std::time::Duration;

        let owned = input.to_string();
        let (tx, rx) = mpsc::channel();
        thread::spawn(move || {
            let mut p = BasicParser::new_eager(owned);
            let _ = p.take_scanning_error();
            let mut events = Vec::new();
            // Collect events until the stream ends or the parser errors out.
            while let Ok(Some(ev)) = p.get_event() {
                events.push(ev);
            }
            // The receiver may already have timed out and been dropped.
            let _ = tx.send(events);
        });
        rx.recv_timeout(Duration::from_secs(2)).ok()
    }

    /// Assert that the first token after `StreamStart` scanned from `input`
    /// is a scalar whose value equals `expected`.
    fn assert_first_scalar(input: &str, expected: &str) {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => {
                    assert_eq!(value, expected, "Failed for input: {input}");
                }
                _ => panic!("Expected scalar token for input: {input}"),
            },
            _ => panic!("Failed to get token for input: {input}"),
        }
    }

    /// Regression: `---` directly followed by non-space text used to spin the
    /// scanner forever because the `-` match arm at line-start dispatched to
    /// `scan_document_start` (which correctly returned None) and then to
    /// `is_plain_scalar_start` (which returns false for `-`, so no consumption
    /// occurred — outer `while let` re-entered with the same char). Fix:
    /// fall through to `scan_plain_scalar` unconditionally when not a doc
    /// marker — the guard already ensures the char is non-whitespace.
    /// See yaml-test-suite tests 82AN / EXG3.
    #[test]
    fn three_dashes_directly_followed_by_text_does_not_hang() {
        let events = parse_with_timeout("---word1\nword2\n")
            .expect("parser hung — `---word1` should not produce an infinite loop");
        // We must produce at least one scalar whose value starts with `---`,
        // proving that the dashes were consumed as part of a plain scalar
        // (not interpreted as a document marker, which would consume them
        // separately).
        let starts_with_dashes = events.iter().any(|e| {
            matches!(&e.event_type,
                crate::parser::EventType::Scalar { value, .. } if value.starts_with("---")
            )
        });
        assert!(
            starts_with_dashes,
            "expected a plain scalar starting with `---`, got events: {events:?}"
        );
    }

    /// YAML 1.2 §7.3.3: `?`, `:`, and `-` may start a plain scalar provided
    /// the next character is non-space (and, in flow context, not a flow
    /// indicator). The previous `is_plain_scalar_start` unconditionally
    /// rejected those three characters, so plain scalars like `?foo`,
    /// `:foo`, `-foo` were reported as `Invalid character`.
    /// Tracked by yaml-test-suite 2EBW.
    #[test]
    fn question_mark_followed_by_text_starts_plain_scalar() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("?foo: bar\n".to_string());
        assert!(p.take_scanning_error().is_none());
        // Every scalar in document order: the key first, then its value.
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec!["?foo", "bar"]);
    }

    /// Companion to the `?` case above: a `:` directly followed by text is
    /// the start of a plain scalar, not a value indicator.
    #[test]
    fn colon_followed_by_text_starts_plain_scalar() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(":foo: bar\n".to_string());
        assert!(p.take_scanning_error().is_none());
        // Every scalar in document order: the key first, then its value.
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec![":foo", "bar"]);
    }

    /// YAML 1.2: every started document must be closed with a DocumentEnd
    /// event before StreamEnd. The previous `TokenType::StreamEnd` handler
    /// only emitted `-DOC` for `DocumentContent` / `BlockNode` states —
    /// the `DocumentStart` state (entered after `---` and a single scalar
    /// like `"foo"`) was skipped, dropping the `-DOC` event. Affected by
    /// yaml-test-suite 27NA, 2G84/*, 2LFX and several others.
    #[test]
    fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n\"foo\"\n".to_string());
        assert!(p.take_scanning_error().is_none());
        let mut kinds = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            kinds.push(match ev.event_type {
                EventType::StreamStart => "+STR",
                EventType::StreamEnd => "-STR",
                EventType::DocumentStart { .. } => "+DOC",
                EventType::DocumentEnd { .. } => "-DOC",
                EventType::Scalar { .. } => "=VAL",
                _ => "?",
            });
        }
        // Critical: -DOC must come before -STR.
        let doc_end_idx = kinds.iter().position(|s| *s == "-DOC");
        let str_end_idx = kinds.iter().position(|s| *s == "-STR");
        assert!(
            doc_end_idx.is_some(),
            "missing -DOC in event stream: {kinds:?}"
        );
        // Spelled out rather than comparing the two `Option`s directly:
        // both markers must be present and correctly ordered.
        assert!(
            matches!((doc_end_idx, str_end_idx), (Some(doc), Some(stream)) if doc < stream),
            "expected -DOC before -STR, got {kinds:?}"
        );
    }

    /// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings:
    /// `\xNN` (8-bit), `\uNNNN` (16-bit) and `\UNNNNNNNN` (32-bit). A literal
    /// non-ASCII character must also pass through unchanged.
    #[test]
    fn double_quoted_hex_escapes_decode_to_codepoint() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        for (input, expected) in [
            (r#""\x41""#, "A"),
            (r#""\u00e9""#, "\u{e9}"),
            (r#""é""#, "é"),
            (r#""\U0001F600""#, "\u{1f600}"),
        ] {
            let mut p = BasicParser::new_eager(input.to_string());
            assert!(
                p.take_scanning_error().is_none(),
                "no scan error for {input}"
            );
            // Only the first scalar matters: each input is a single string.
            let mut found = None;
            while let Ok(Some(ev)) = p.get_event() {
                if let EventType::Scalar { value, .. } = ev.event_type {
                    found = Some(value);
                    break;
                }
            }
            assert_eq!(found.as_deref(), Some(expected), "input {input}");
        }
    }

    /// A `\x` escape needs exactly two hex digits; one digit followed by the
    /// closing quote must be rejected.
    #[test]
    fn truncated_hex_escape_is_a_scan_error() {
        use crate::parser::BasicParser;
        let mut p = BasicParser::new_eager(r#""\x4""#.to_string());
        assert!(
            p.take_scanning_error().is_some(),
            "truncated \\x escape must error"
        );
    }

    /// YAML 1.2 §5.7: double-quoted strings have a strict allowlist of escape
    /// sequences. `\.` (and any other unknown escape) must be reported as a
    /// scan error. Tracked by yaml-test-suite 55WF.
    #[test]
    fn invalid_double_quoted_escape_is_a_scan_error() {
        use crate::parser::{BasicParser, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n\"\\.\"\n".to_string());
        let scan_err = p.take_scanning_error();
        // The error may surface either eagerly (scan error) or while
        // draining events; accept both.
        let mut parse_err = false;
        if scan_err.is_none() {
            loop {
                match p.get_event() {
                    Ok(Some(_)) => {}
                    Ok(None) => break,
                    Err(_) => {
                        parse_err = true;
                        break;
                    }
                }
            }
        }
        assert!(
            scan_err.is_some() || parse_err,
            "`\\.` is not a valid double-quoted escape and must error"
        );
    }

    /// YAML 1.2: a complex-key marker (`?`) is the first content after an
    /// explicit document start (`---`) — it should open an implicit block
    /// mapping. The previous parser handled `?` only in
    /// `ImplicitDocumentStart` / `DocumentContent` / already-in-mapping
    /// states and errored out for `DocumentStart`, breaking inputs like
    /// `--- !!set\n? Mark McGwire\n...`. Tracked by yaml-test-suite 2XXW.
    #[test]
    fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
        assert!(p.take_scanning_error().is_none());
        let mut saw_map_start = false;
        let mut saw_error = false;
        loop {
            match p.get_event() {
                Ok(Some(ev)) => {
                    if matches!(ev.event_type, EventType::MappingStart { .. }) {
                        saw_map_start = true;
                    }
                }
                Ok(None) => break,
                Err(_) => {
                    saw_error = true;
                    break;
                }
            }
        }
        assert!(!saw_error, "complex key after `--- !!set` must not error");
        assert!(saw_map_start, "expected a MappingStart event");
    }

    /// YAML 1.2 §6.9.2: anchor / alias names exclude only whitespace and
    /// the flow indicators `,[]{}`. Earlier implementations restricted
    /// `scan_identifier` to ASCII alphanumeric / `_` / `-`, which rejected
    /// valid unicode anchors like `&😁`. Tracked by yaml-test-suite 8XYN.
    #[test]
    fn anchor_name_may_contain_unicode_symbols() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
        assert!(
            p.take_scanning_error().is_none(),
            "unicode anchor must not error"
        );
        let mut anchors = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar {
                anchor: Some(a), ..
            } = ev.event_type
            {
                anchors.push(a);
            }
        }
        assert_eq!(anchors, vec!["😁"]);
    }

    /// YAML 1.2 §5.6 / RFC 3986 percent-encoding: tag suffixes may contain
    /// `%XX` percent-escaped characters, which must be URI-decoded when
    /// resolved. The scanner used to reject `%` in tag suffixes as
    /// "Invalid character", so e.g. `!e!tag%21 baz` failed before the
    /// resolver got a chance to decode it. Tracked by yaml-test-suite 6CK3.
    #[test]
    fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(
            "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n".to_string(),
        );
        assert!(
            p.take_scanning_error().is_none(),
            "tag percent-escapes must not error"
        );
        let mut tags = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { tag: Some(t), .. } = ev.event_type {
                tags.push(t);
            }
        }
        assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
    }

    /// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it
    /// does not recognize." A `%FOO` reserved directive must NOT be treated
    /// as a scan error — the directive line is silently skipped and parsing
    /// continues. Tracked by yaml-test-suite test 2LFX.
    #[test]
    fn reserved_directive_is_ignored_not_an_error() {
        use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
        let mut p = BasicParser::new_eager(
            "%FOO  bar baz # Should be ignored\n              # with a warning.\n---\n\"foo\"\n"
                .to_string(),
        );
        assert!(
            p.take_scanning_error().is_none(),
            "unknown directives must NOT produce a scan error"
        );
        let mut scalars = Vec::new();
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                scalars.push(value);
            }
        }
        assert_eq!(scalars, vec!["foo"]);
    }

    /// Spec requires the two physical lines of `---word1\nword2` to fold into
    /// a single plain scalar `"---word1 word2"`. Tracked by yaml-test-suite 82AN.
    #[test]
    fn three_dashes_followed_by_text_folds_continuation_line() {
        let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");
        let scalars: Vec<&str> = events
            .iter()
            .filter_map(|e| match &e.event_type {
                crate::parser::EventType::Scalar { value, .. } => Some(value.as_str()),
                _ => None,
            })
            .collect();
        assert_eq!(scalars, vec!["---word1 word2"]);
    }

    /// Regression: tab between block-entry marker and a `-N` value used to
    /// hang the scanner via the same `-` match arm. See yaml-test-suite
    /// Y79Y/010.
    #[test]
    fn dash_tab_negative_number_does_not_hang() {
        let events = parse_with_timeout("-\t-1\n")
            .expect("parser hung — `-\\t-1` should not produce an infinite loop");
        assert!(!events.is_empty(), "expected event stream, got none");
    }

    /// A bare scalar document yields exactly StreamStart, Scalar, StreamEnd.
    #[test]
    fn test_basic_tokenization() {
        let mut scanner = BasicScanner::new("42".to_string());

        assert!(scanner.check_token());

        // StreamStart
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamStart));

        // Number
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "42");
        } else {
            panic!("Expected scalar token");
        }

        // StreamEnd
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamEnd));
    }

    /// The opening of a flow sequence tokenizes as `[`, first entry, `,`.
    #[test]
    fn test_flow_sequence() {
        let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

        // StreamStart
        scanner.get_token().unwrap();

        // [
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

        // 1 — fail loudly if this is not a scalar instead of silently
        // skipping the value check.
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "1");
        } else {
            panic!("Expected scalar token");
        }

        // ,
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowEntry));
    }

    /// A double-quoted string yields a scalar without the surrounding quotes.
    #[test]
    fn test_quoted_strings() {
        let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

        // StreamStart
        scanner.get_token().unwrap();

        // Quoted string
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "hello world");
        } else {
            panic!("Expected scalar token");
        }
    }

    /// The default scanner drops comments: neither full-line nor trailing
    /// comments may leak into scalars or appear as `Comment` tokens.
    #[test]
    fn test_comment_handling() {
        let input = r"
# Full line comment
key: value  # End of line comment
# Another comment
data: test
";
        let mut scanner = BasicScanner::new(input.to_string());

        let mut tokens = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            tokens.push(token);
        }

        // Should only contain YAML structure tokens, no comment tokens
        let scalar_values: Vec<String> = tokens
            .iter()
            .filter_map(|t| match &t.token_type {
                TokenType::Scalar(s, _) => Some(s.clone()),
                _ => None,
            })
            .collect();

        assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

        // Should not contain any comment tokens
        assert!(
            !tokens
                .iter()
                .any(|t| matches!(t.token_type, TokenType::Comment(_)))
        );
    }

    /// A `#` inside a quoted string is content, not the start of a comment;
    /// a `#` after whitespace on a plain line still starts one.
    #[test]
    fn test_hash_in_strings() {
        let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
        let mut scanner = BasicScanner::new(input.to_string());

        let mut scalar_values = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            if let TokenType::Scalar(value, _) = token.token_type {
                scalar_values.push(value);
            }
        }

        assert!(scalar_values.contains(&"This has a # character".to_string()));
        assert!(scalar_values.contains(&"Also has # character".to_string()));
        assert!(scalar_values.contains(&"value".to_string()));
        assert!(
            !scalar_values
                .iter()
                .any(|s| s.contains("This is a comment"))
        );
    }

    #[test]
    fn test_escape_sequences() {
        // YAML 1.2 §5.7 double-quoted escape sequences. Single-quoted strings
        // have NO backslash escapes — `''` is the only escape — so this set
        // is restricted to the double-quoted cases.
        let test_cases = [
            (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
            (r#""Col1\tCol2""#, "Col1\tCol2"),
            (r#""First\rSecond""#, "First\rSecond"),
            (r#""Path\\to\\file""#, "Path\\to\\file"),
            (r#""He said \"Hello\"""#, "He said \"Hello\""),
        ];

        for (input, expected) in test_cases {
            assert_first_scalar(input, expected);
        }
    }

    #[test]
    fn test_extended_yaml_escapes() {
        // Test additional YAML escape sequences
        let test_cases = [
            (r#""\0""#, "\0"),   // null character
            (r#""\a""#, "\x07"), // bell
            (r#""\b""#, "\x08"), // backspace
            (r#""\f""#, "\x0C"), // form feed
            (r#""\v""#, "\x0B"), // vertical tab
            (r#""\e""#, "\x1B"), // escape
            (r#""\ ""#, " "),    // literal space
            (r#""\/""#, "/"),    // literal forward slash
        ];

        for (input, expected) in test_cases {
            assert_first_scalar(input, expected);
        }
    }

    #[test]
    fn test_unknown_escape_sequences() {
        // YAML 1.2 §5.7: unknown double-quoted escapes are scan errors, not
        // preserved literals. (Earlier versions of this scanner kept the
        // backslash + char verbatim — see commit history.)
        for input in [r#""\z""#, r#""\q""#, r#""\8""#] {
            let mut scanner = BasicScanner::new(input.to_string());
            scanner.get_token().unwrap(); // StreamStart
            assert!(
                scanner.get_token().is_err(),
                "expected scan error for invalid escape in {input}"
            );
        }
    }
}