rust_yaml/scanner/mod.rs
1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
/// Trait for YAML scanners that convert character streams to tokens.
///
/// The interface has three parts: token access (`check_token`,
/// `peek_token`, `get_token`), state control (`reset`), and diagnostics
/// (`position`, `input`).
pub trait Scanner {
    /// Check if there are more tokens available.
    fn check_token(&self) -> bool;

    /// Peek at the next token without consuming it.
    ///
    /// The token is handed out by reference. The call is fallible, so an
    /// implementation may report a scan error here.
    // NOTE(review): `Ok(None)` presumably signals end of the token stream —
    // confirm against the implementations.
    fn peek_token(&self) -> Result<Option<&Token>>;

    /// Get the next token, consuming it.
    ///
    /// Unlike `peek_token`, the token is returned by value.
    fn get_token(&mut self) -> Result<Option<Token>>;

    /// Reset the scanner state.
    fn reset(&mut self);

    /// Get the current position in the input.
    fn position(&self) -> Position;

    /// Get the input text for error reporting.
    fn input(&self) -> &str;
}
35
/// How the tail of a block scalar is treated (YAML 1.2 §8.1.1.2).
///
/// - `Strip` (`-`): the final line break and any trailing empty lines go away.
/// - `Clip` (default): exactly one final line break survives; trailing empty
///   lines are dropped.
/// - `Keep` (`+`): the final line break and every trailing empty line stay.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    Strip,
    Clip,
    Keep,
}

/// Trim the tail of a collected block scalar according to `mode`.
///
/// The collectors append a `\n` for every line, content or blank, so the
/// chomping decision reduces to how many trailing `\n` characters survive:
///
/// - **Keep:** all of them; the string is returned untouched.
/// - **Strip:** none of them.
/// - **Clip:** exactly one, and only when something other than line breaks
///   is left. An empty clip scalar stays `""` rather than becoming `"\n"`
///   (yaml-test-suite K858).
fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
    let restore_one_break = match mode {
        ChompingMode::Keep => return s,
        ChompingMode::Strip => false,
        ChompingMode::Clip => true,
    };

    // Cut off the whole run of trailing line breaks in one step.
    let content_len = s.trim_end_matches('\n').len();
    s.truncate(content_len);

    if restore_one_break && content_len > 0 {
        s.push('\n');
    }
    s
}
81
/// A basic scanner implementation for YAML tokenization.
///
/// The scanner owns the input text, a decoded per-character cache of it, a
/// cursor into that cache, and the tokens produced so far. Block structure
/// is tracked through `indent_stack` (plus the parallel `indent_is_sequence`
/// and the separate `compact_sequence_indents`); flow nesting is tracked
/// through `flow_level`.
#[derive(Debug)]
#[allow(dead_code)]
pub struct BasicScanner {
    // Original input text; also used to build error contexts.
    input: String,
    // Position of the cursor, advanced one character at a time.
    position: Position,
    // Character under the cursor; `None` once the cache is exhausted.
    current_char: Option<char>,
    // Tokens produced so far.
    tokens: Vec<Token>,
    // NOTE(review): presumably the index of the next token to hand out —
    // confirm against the token accessors.
    token_index: usize,
    // Constructed as `true` when the input is rejected by the size limit.
    done: bool,
    // Open block indentation levels; always starts with the base level 0.
    indent_stack: Vec<usize>,
    // Indentation of the line most recently processed by `handle_indentation`.
    current_indent: usize,
    allow_simple_key: bool,
    simple_key_allowed: bool,
    // Greater than zero while inside a flow collection.
    flow_level: usize,
    // Enabled by the `*_with_comments` constructors.
    preserve_comments: bool,
    // Indentation style detection
    detected_indent_style: Option<crate::value::IndentStyle>,
    indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
    previous_indent_level: usize, // Track the previous indentation for style detection
    // Performance optimizations
    buffer: String,                   // Reusable string buffer for token values
    char_cache: Vec<char>,            // Cached characters for faster access
    char_indices: Vec<(usize, char)>, // Cached character indices for O(1) lookups
    current_char_index: usize,        // Current index in char_cache
    profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
    // Error tracking
    scanning_error: Option<Error>, // Store scanning errors for later retrieval
    // Resource tracking
    limits: Limits,
    resource_tracker: ResourceTracker,
    // Track inline nested sequences that need closing
    inline_sequence_depth: usize,
    // Track compact-notation sequences (where `-` is at the same indent as
    // the parent mapping keys). These are NOT on indent_stack, so we need
    // separate tracking to know when to emit BlockEnd for them.
    compact_sequence_indents: Vec<usize>,
    // Parallel to indent_stack: true when the entry was pushed by a block
    // sequence, false when by a mapping. Lets us distinguish "continuing a
    // regular sequence" from "starting a compact sequence at same indent".
    indent_is_sequence: Vec<bool>,
}
124
125impl BasicScanner {
126 /// Create a new scanner from input string
127 pub fn new(input: String) -> Self {
128 Self::with_limits(input, Limits::default())
129 }
130
131 /// Create a new scanner with custom resource limits
132 pub fn with_limits(input: String, limits: Limits) -> Self {
133 let char_cache: Vec<char> = input.chars().collect();
134 let char_indices: Vec<(usize, char)> = input.char_indices().collect();
135 let current_char = char_cache.first().copied();
136
137 // Track document size for resource limits
138 let mut resource_tracker = ResourceTracker::new();
139 if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
140 // If the input is too large, create scanner with error state
141 return Self {
142 current_char: None,
143 input,
144 position: Position::start(),
145 tokens: Vec::new(),
146 token_index: 0,
147 done: true,
148 indent_stack: vec![0],
149 current_indent: 0,
150 allow_simple_key: false,
151 simple_key_allowed: false,
152 flow_level: 0,
153 preserve_comments: false,
154 detected_indent_style: None,
155 indent_samples: Vec::new(),
156 previous_indent_level: 0,
157 buffer: String::new(),
158 char_cache: Vec::new(),
159 char_indices: Vec::new(),
160 current_char_index: 0,
161 profiler: None,
162 scanning_error: Some(e),
163 limits,
164 resource_tracker,
165 inline_sequence_depth: 0,
166 compact_sequence_indents: Vec::new(),
167 indent_is_sequence: vec![false],
168 };
169 }
170
171 Self {
172 current_char,
173 input,
174 position: Position::start(),
175 tokens: Vec::new(),
176 token_index: 0,
177 done: false,
178 indent_stack: vec![0], // Always start with base indentation
179 current_indent: 0,
180 allow_simple_key: true,
181 simple_key_allowed: true,
182 flow_level: 0,
183 preserve_comments: false,
184 detected_indent_style: None,
185 indent_samples: Vec::new(),
186 previous_indent_level: 0,
187 buffer: String::with_capacity(64), // Pre-allocate buffer
188 char_cache,
189 char_indices,
190 current_char_index: 0,
191 profiler: std::env::var("RUST_YAML_PROFILE")
192 .ok()
193 .map(|_| crate::profiling::YamlProfiler::new()),
194 scanning_error: None,
195 limits,
196 resource_tracker,
197 inline_sequence_depth: 0,
198 compact_sequence_indents: Vec::new(),
199 indent_is_sequence: vec![false],
200 }
201 }
202
203 /// Create a new scanner with eager token scanning (for compatibility)
204 pub fn new_eager(input: String) -> Self {
205 Self::new_eager_with_limits(input, Limits::default())
206 }
207
208 /// Create a new scanner with eager token scanning and custom limits
209 pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
210 let mut scanner = Self::with_limits(input, limits);
211 // Store any scanning errors for later retrieval
212 if let Err(error) = scanner.scan_all_tokens() {
213 scanner.scanning_error = Some(error);
214 }
215 scanner
216 }
217
218 /// Create a new scanner with comment preservation enabled
219 pub fn new_with_comments(input: String) -> Self {
220 let mut scanner = Self::new(input);
221 scanner.preserve_comments = true;
222 scanner
223 }
224
225 /// Create a new scanner with comments and custom limits
226 pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
227 let mut scanner = Self::with_limits(input, limits);
228 scanner.preserve_comments = true;
229 scanner
230 }
231
232 /// Create a new scanner with eager scanning and comment preservation
233 pub fn new_eager_with_comments(input: String) -> Self {
234 let mut scanner = Self::new_with_comments(input);
235 // Mirror `new_eager_with_limits`: record scanning errors instead
236 // of discarding them (#19). Previously this used
237 // `unwrap_or(())`, silently truncating the token stream and
238 // returning a scanner whose `has_scanning_error()` reported
239 // false — silent data loss for comment-preserving callers.
240 if let Err(error) = scanner.scan_all_tokens() {
241 scanner.scanning_error = Some(error);
242 }
243 scanner
244 }
245
246 /// Get the detected indentation style from the document
247 pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
248 self.detected_indent_style.as_ref()
249 }
250
251 /// Check if there was a scanning error
252 pub const fn has_scanning_error(&self) -> bool {
253 self.scanning_error.is_some()
254 }
255
256 /// Get the scanning error if any
257 #[allow(clippy::missing_const_for_fn)]
258 pub fn take_scanning_error(&mut self) -> Option<Error> {
259 self.scanning_error.take()
260 }
261
262 /// Advance to the next character
263 fn advance(&mut self) -> Option<char> {
264 if let Some(ch) = self.current_char {
265 self.position = self.position.advance(ch);
266 self.current_char_index += 1;
267
268 if self.current_char_index < self.char_cache.len() {
269 self.current_char = Some(self.char_cache[self.current_char_index]);
270 } else {
271 self.current_char = None;
272 }
273 }
274
275 self.current_char
276 }
277
278 /// Skip whitespace characters (excluding newlines)
279 fn skip_whitespace(&mut self) {
280 while let Some(ch) = self.current_char {
281 if ch == ' ' || ch == '\t' {
282 self.advance();
283 } else {
284 break;
285 }
286 }
287 }
288
/// Handle indentation and produce block tokens if necessary.
///
/// In flow context the indentation is only probed (nothing is consumed) to
/// enforce the minimum indent of continuation lines. In block context the
/// leading spaces/tabs are consumed, indentation style analysis is run, and
/// `BlockEnd` tokens are pushed for every compact sequence and every
/// `indent_stack` level that the new line closes.
///
/// # Errors
///
/// Returns a scan error when a flow continuation line is not indented past
/// the enclosing block, when a dedent lands on a column that matches no
/// open container, or when `analyze_indentation_pattern` rejects the
/// indentation characters.
fn handle_indentation(&mut self) -> Result<()> {
    // In flow context: if there is a non-trivial enclosing block
    // (indent_stack has more than the implicit root level), each
    // continuation line that has content must be indented MORE than
    // that enclosing block's indent. `flow: [a,\nb,c]` with `b`
    // at col 1 violates this rule because the block mapping enclosing
    // `flow:` sits at indent 0 (yaml-test-suite 9C9N).
    //
    // Top-level flow (no enclosing block; indent_stack is just [0])
    // is exempt — `[a,\nb]` is fine there because the flow content
    // isn't nested inside any block (yaml-test-suite 4ZYM).
    if self.flow_level > 0 {
        if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
            // Look ahead through the cache without moving the cursor.
            // Only spaces count towards `probe`; tabs are skipped over
            // but contribute nothing.
            let mut probe = 0usize;
            let mut i = self.current_char_index;
            while i < self.char_cache.len() {
                match self.char_cache[i] {
                    ' ' => {
                        probe += 1;
                        i += 1;
                    }
                    '\t' => i += 1,
                    _ => break,
                }
            }
            // End of input and blank lines carry no content; a `#` does
            // count as content here.
            let has_content = self
                .char_cache
                .get(i)
                .map_or(false, |c| !matches!(c, '\n' | '\r'));
            // A line that begins with the matching flow closer
            // (`]` / `}`) is allowed at the parent indent — it
            // closes the flow collection, not adds content
            // (yaml-test-suite NKF9 trailing-line `}` at col 1).
            let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
            if has_content && !is_closer {
                // Only `indent_stack` supplies the parent indent; the
                // compact-sequence stack is not consulted for the value.
                let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
                if probe <= parent_indent {
                    return Err(Error::scan(
                        self.position,
                        "Flow content line is not indented enough".to_string(),
                    ));
                }
            }
        }
        return Ok(());
    }

    // Position of the line start; every BlockEnd emitted below is stamped
    // with it.
    let line_start_pos = self.position;
    let mut indent = 0;
    let mut has_tabs = false;
    let mut has_spaces = false;
    let _indent_start_pos = self.position;

    // Count indentation and detect style
    while let Some(ch) = self.current_char {
        if ch == ' ' {
            indent += 1;
            has_spaces = true;
            self.advance();
        } else if ch == '\t' {
            indent += 8; // Tab counts as 8 spaces for indentation calculation
            has_tabs = true;
            self.advance();
        } else {
            break;
        }
    }

    // Analyze indentation pattern for style detection
    // Only analyze if there's actual content after the indentation (not just whitespace)
    // This runs BEFORE `previous_indent_level` is updated below, so the
    // analysis compares against the previously recorded indent.
    if indent > 0
        && self.current_char.is_some()
        && !matches!(self.current_char, Some('\n' | '\r'))
    {
        self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
    }

    // YAML 1.2 §6.1 does NOT require all indents to be multiples
    // of a single "indent width". Siblings must share a column;
    // children must indent further; but any positive amount works
    // (e.g. `key:\n child:\n grandchild:` with widths 2, 1
    // is legal). The earlier strict-multiple-of-N check rejected
    // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
    // UGM3. We rely on the indent_stack-driven open/close logic
    // (and the per-block "more than parent" rule enforced
    // elsewhere) to catch genuine mis-indentation.

    // Update previous indentation level for future comparisons
    // NOTE(review): a line at column 0 does not reset this value, so it
    // keeps the last non-zero indent — confirm that is intended.
    if indent > 0 {
        self.previous_indent_level = indent;
    }

    // Update current indentation level
    self.current_indent = indent;

    // Close compact-notation sequences whose scope ends at this line.
    // A compact sequence (where `-` shares the indent of the parent
    // mapping keys) ends when the next content line at that indent is
    // NOT a block entry (`- `). We must emit the sequence's BlockEnd
    // BEFORE popping the indent_stack so that the nesting order is
    // correct (sequence closes before its parent mapping).
    let has_content =
        self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
    if has_content {
        // `-` followed by whitespace or end of input is a block entry.
        let is_block_entry = self.current_char == Some('-')
            && self.peek_char(1).map_or(true, |c| c.is_whitespace());
        while let Some(&seq_indent) = self.compact_sequence_indents.last() {
            if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
                self.compact_sequence_indents.pop();
                self.tokens
                    .push(Token::simple(TokenType::BlockEnd, line_start_pos));
            } else {
                break;
            }
        }
    }

    // Check if we need to emit block end tokens for decreased indentation
    // The `last_indent > 0` guard keeps the base level 0 on the stack.
    let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
    while let Some(&last_indent) = self.indent_stack.last() {
        if indent < last_indent && last_indent > 0 {
            // Keep the parallel sequence/mapping flags in step.
            self.indent_stack.pop();
            self.indent_is_sequence.pop();
            self.tokens
                .push(Token::simple(TokenType::BlockEnd, line_start_pos));
        } else {
            break;
        }
    }

    // §6.1: after a dedent, the new line's indent must match some
    // existing container level — keys/items at a sibling level
    // must share a column. Landing at a column that is between
    // two stack levels (e.g. parent at 0, just-closed at 3, new
    // line at 1) is invalid because no open mapping/sequence sits
    // at indent 1 (yaml-test-suite DMG6, N4JP).
    //
    // The check applies only when:
    //   * we actually dedented (pre-pop top was deeper than now),
    //   * the new line has content (the next char is not blank /
    //     newline / EOF / comment),
    //   * indent doesn't match the new top.
    if pre_pop_top > 0
        && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
        && self
            .current_char
            .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
        && indent != self.indent_stack.last().copied().unwrap_or(0)
    {
        // Allow if indent is a valid deeper level — e.g.
        // sibling at depth then deeper child — but for the
        // dedent path indent must equal a known stack level.
        return Err(Error::scan(
            self.position,
            format!(
                "Indentation {indent} doesn't match any open container (expected {} or deeper)",
                self.indent_stack.last().copied().unwrap_or(0)
            ),
        ));
    }

    Ok(())
}
453
454 /// Analyze indentation pattern to detect the document's indentation style
455 fn analyze_indentation_pattern(
456 &mut self,
457 current_indent: usize,
458 has_tabs: bool,
459 has_spaces: bool,
460 ) -> Result<()> {
461 // Prevent mixed indentation (tabs + spaces on same line).
462 // Carve-out: a tab AFTER one or more spaces and BEFORE
463 // value-position content (not a key) is content-area
464 // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
465 // space is indent, the tab is a separator before \`bar\`
466 // which is the value of \`foo:\` (yaml-test-suite DK95/00).
467 if has_tabs && has_spaces {
468 // Peek ahead: if the content after the tab+spaces area
469 // contains a key marker (`: ` or `:`+EOL), treat as
470 // indentation (invalid). Otherwise it's a value line.
471 let looks_like_key = self.line_after_indent_is_implicit_key();
472 if looks_like_key {
473 let context =
474 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
475 .with_suggestion(
476 "Use either tabs OR spaces for indentation, not both".to_string(),
477 );
478 return Err(Error::invalid_character_with_context(
479 self.position,
480 '\t',
481 "mixed indentation",
482 context,
483 ));
484 }
485 }
486 // §6.1: indentation must be space characters only. Pure-tab
487 // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
488 // 4EJS). Two carve-outs:
489 // * The mixed case is caught by the earlier branch.
490 // * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
491 // at the root are not "block indentation" — there's no
492 // enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
493 // them.
494 if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
495 let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
496 .with_suggestion("Use space characters for indentation".to_string());
497 return Err(Error::invalid_character_with_context(
498 self.position,
499 '\t',
500 "indentation",
501 context,
502 ));
503 }
504
505 // If we detected tabs, check for mixed indentation across lines
506 if has_tabs {
507 match self.detected_indent_style {
508 None => {
509 // First time detecting indentation style - set to tabs
510 self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
511 }
512 Some(crate::value::IndentStyle::Spaces(_)) => {
513 // Previously detected spaces, now seeing tabs - mixed indentation error
514 let context =
515 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
516 .with_suggestion(
517 "Use consistent indentation style throughout the document"
518 .to_string(),
519 );
520 return Err(Error::invalid_character_with_context(
521 self.position,
522 '\t',
523 "mixed indentation",
524 context,
525 ));
526 }
527 Some(crate::value::IndentStyle::Tabs) => {
528 // Already using tabs - this is consistent
529 }
530 }
531 return Ok(());
532 }
533
534 // For spaces, check for mixed indentation across lines first
535 if has_spaces {
536 // Check if we previously detected tabs
537 if matches!(
538 self.detected_indent_style,
539 Some(crate::value::IndentStyle::Tabs)
540 ) {
541 let context =
542 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
543 .with_suggestion(
544 "Use consistent indentation style throughout the document".to_string(),
545 );
546 return Err(Error::invalid_character_with_context(
547 self.position,
548 ' ',
549 "mixed indentation",
550 context,
551 ));
552 }
553
554 // Calculate the indentation level difference
555 if current_indent > self.previous_indent_level {
556 let indent_diff = current_indent - self.previous_indent_level;
557
558 // Store this sample for analysis (but only meaningful differences)
559 if indent_diff > 0 && indent_diff <= 8 {
560 // Reasonable indentation range
561 self.indent_samples.push((indent_diff, false));
562
563 // Try to determine consistent indentation width
564 if self.detected_indent_style.is_none() {
565 self.detect_space_indentation_width();
566 }
567 }
568 }
569
570 // YAML 1.2 §6.1 does NOT require all indents to be multiples
571 // of a single "indent width". Sibling lines must share a
572 // column and children must indent deeper than parents, but
573 // any positive amount works. The "multiple of N" check
574 // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
575 // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
576 // open/close logic for genuine mis-indentation. The detected
577 // style is still recorded for later style-preservation use
578 // (e.g. emitter), it just no longer drives validation.
579 // self.validate_indentation_consistency(current_indent)?;
580 }
581
582 Ok(())
583 }
584
585 /// Detect the consistent space indentation width from samples
586 fn detect_space_indentation_width(&mut self) {
587 if self.indent_samples.is_empty() {
588 return; // Need at least 1 sample
589 }
590
591 // Find the most common indentation width
592 let mut width_counts = std::collections::HashMap::new();
593
594 for &(width, is_tabs) in &self.indent_samples {
595 if !is_tabs && width > 0 {
596 *width_counts.entry(width).or_insert(0) += 1;
597 }
598 }
599
600 // Find the most frequent width - be more aggressive and detect early
601 if let Some((&most_common_width, &_count)) =
602 width_counts.iter().max_by_key(|&(_, count)| count)
603 {
604 // Set on first consistent sample to enable stricter validation
605 self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
606 }
607 }
608
609 /// Check if the given indentation level is valid based on current context
610 #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
611 fn is_valid_indentation_level(&self, indent: usize) -> bool {
612 // For now, allow any indentation that could represent valid nesting
613 // In the future, this could be made more strict by checking against
614 // the current indent_stack to ensure proper nesting
615 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
616 // Must be a multiple of the detected width
617 indent % width == 0
618 } else {
619 // If no style detected yet, allow any indentation
620 true
621 }
622 }
623
624 /// Validate that current indentation is consistent with detected style
625 fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
626 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
627 // Check if current indentation is a multiple of the detected width
628 if current_indent > 0 && current_indent % width != 0 {
629 let lower_level = (current_indent / width) * width;
630 let higher_level = lower_level + width;
631 let suggestion = format!(
632 "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
633 width, lower_level, higher_level, current_indent
634 );
635 let context =
636 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
637 .with_suggestion(suggestion);
638 return Err(Error::indentation_with_context(
639 self.position,
640 (current_indent / width) * width, // expected (nearest valid level)
641 current_indent, // found
642 context,
643 ));
644 }
645 }
646 Ok(())
647 }
648
649 /// Check if current position starts a plain scalar
650 fn is_plain_scalar_start(&self) -> bool {
651 self.current_char.map_or(false, |ch| match ch {
652 // Pure indicators — never start a plain scalar.
653 ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
654 | '@' | '`' => false,
655 // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
656 // the next character is non-whitespace (and, in flow context,
657 // not a flow indicator). Otherwise they act as indicators
658 // (complex-key marker / value separator / block-entry marker).
659 '?' | ':' | '-' => match self.peek_char(1) {
660 None => false,
661 Some(c) if c.is_whitespace() => false,
662 Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
663 Some(_) => true,
664 },
665 _ => !ch.is_whitespace(),
666 })
667 }
668
/// Report whether `value` is one of the recognised YAML boolean spellings.
///
/// Accepted are `true`/`false`, `yes`/`no` and `on`/`off`, each in
/// lowercase, Capitalized and UPPERCASE form. Matching is exact: other
/// casings and surrounding whitespace are not accepted.
fn is_yaml_bool(value: &str) -> bool {
    const BOOL_SPELLINGS: [&str; 18] = [
        "true", "false", "True", "False", "TRUE", "FALSE", "yes", "no", "Yes", "No", "YES", "NO",
        "on", "off", "On", "Off", "ON", "OFF",
    ];
    BOOL_SPELLINGS.contains(&value)
}
693
/// Report whether `value` is one of the recognised YAML null spellings:
/// `null`, `Null`, `NULL`, `~`, or the empty string.
fn is_yaml_null(value: &str) -> bool {
    const NULL_SPELLINGS: [&str; 5] = ["null", "Null", "NULL", "~", ""];
    NULL_SPELLINGS.contains(&value)
}
698
/// Normalize a scalar value; currently an identity transformation.
///
/// Plain scalars leave the scanner with their original text. Resolving
/// types, including the version-aware bool/null mapping, is the composer's
/// job (see `crate::resolver::resolve_plain_scalar`). Handing the text on
/// untouched keeps the YAML 1.1 vs 1.2 distinction decidable there and lets
/// round-trip emitters reproduce the original spelling.
fn normalize_scalar(value: String) -> String {
    value
}
710
711 /// Scan a number token
712 fn scan_number(&mut self) -> Result<Token> {
713 let start_pos = self.position;
714 let mut value = String::new();
715
716 // Handle negative numbers
717 if self.current_char == Some('-') {
718 value.push('-');
719 self.advance();
720 }
721
722 // Scan digits
723 while let Some(ch) = self.current_char {
724 if ch.is_ascii_digit() {
725 value.push(ch);
726 self.advance();
727 } else if ch == '.' {
728 value.push(ch);
729 self.advance();
730 // Scan fractional part
731 while let Some(ch) = self.current_char {
732 if ch.is_ascii_digit() {
733 value.push(ch);
734 self.advance();
735 } else {
736 break;
737 }
738 }
739 break;
740 } else {
741 break;
742 }
743 }
744
745 Ok(Token::new(
746 TokenType::Scalar(value, tokens::QuoteStyle::Plain),
747 start_pos,
748 self.position,
749 ))
750 }
751
/// Scan a plain scalar (unquoted string).
///
/// Collects characters up to the first terminator for the current context
/// (block or flow), then tries to fold following lines into the same scalar
/// according to the line-folding rules. The cursor is restored to the end
/// of the last accepted line when a following line cannot continue the
/// scalar.
///
/// Returns a plain-style `Scalar` token whose text has trailing whitespace
/// removed, spanning the start position up to the cursor.
///
/// # Errors
///
/// Returns a scan error when a scalar that folded across lines in block
/// context is directly followed by `:` (it would be a multi-line implicit
/// key), or propagates the error from the string-length resource check.
fn scan_plain_scalar(&mut self) -> Result<Token> {
    let start_pos = self.position;
    let start_col = start_pos.column;
    let mut value = String::new();
    let mut multi_line = false;

    loop {
        // Scan content on the current line until we hit a stop condition.
        while let Some(ch) = self.current_char {
            if self.flow_level == 0 {
                match ch {
                    '\n' | '\r' => break,
                    // `:` ends the scalar only when followed by whitespace
                    // or end of input.
                    ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
                    // `#` starts a comment only at the very beginning or
                    // after whitespace.
                    '#' if value.is_empty()
                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
                    {
                        break;
                    }
                    _ => {}
                }
            } else {
                match ch {
                    // Same line-break handling as block context: stop
                    // collecting raw content at `\n`/`\r`, then let the
                    // outer fold logic decide whether the next line
                    // continues this scalar (yaml-test-suite 8KB6,
                    // 8UDB, 9BXH).
                    '\n' | '\r' => break,
                    ',' | '[' | ']' | '{' | '}' => break,
                    // In flow context, `:` is a key-value separator
                    // when followed by whitespace OR any flow indicator
                    // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
                    // suite FRK4 (`{ ? foo :, ... }`).
                    ':' if self
                        .peek_char(1)
                        .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
                    {
                        break;
                    }
                    '#' if value.is_empty()
                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
                    {
                        break;
                    }
                    _ => {}
                }
            }
            value.push(ch);
            self.advance();
        }

        // If we didn't stop at a newline, this scalar is complete.
        if !matches!(self.current_char, Some('\n' | '\r')) {
            break;
        }

        // Per §6.5 line folding, trailing whitespace on the line is
        // dropped (it gets replaced by the fold separator that the
        // next continuation block emits).
        while matches!(value.chars().last(), Some(' ' | '\t')) {
            value.pop();
        }

        // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
        // the same plain scalar. A continuation line must be:
        //   * indented strictly more than the scalar's start column,
        //   * not a document marker (`---` / `...`),
        //   * not a comment-only line,
        //   * not empty-with-EOF.
        // Save state for backtracking if continuation isn't allowed.
        // NOTE(review): the column test below accepts
        // `next_col >= start_col`, i.e. the same column too, which is
        // looser than "strictly more" in the list above — confirm which
        // one is intended.
        let saved_position = self.position;
        let saved_index = self.current_char_index;
        let saved_char = self.current_char;

        // Count physical newlines we skip; whitespace within the lines
        // is also consumed.
        let mut newlines = 0usize;
        loop {
            match self.current_char {
                Some('\n') => {
                    newlines += 1;
                    self.advance();
                }
                // `\r` is consumed without being counted.
                Some('\r') => {
                    self.advance();
                }
                Some(' ' | '\t') => {
                    self.advance();
                }
                _ => break,
            }
        }

        let next_col = self.position.column;
        let next_ch = self.current_char;
        // Three identical `-` or `.` characters followed by whitespace or
        // end of input.
        let is_doc_marker = matches!(next_ch, Some('-' | '.'))
            && self.peek_char(1) == next_ch
            && self.peek_char(2) == next_ch
            && self.peek_char(3).map_or(true, |c| c.is_whitespace());

        // Continuation column rule:
        //   * Flow context: no column rule, only flow indicators
        //     terminate (8KB6, 8UDB, 9BXH).
        //   * Block context: must be strictly deeper than the parent
        //     block's key column. The parent indent is the max of
        //     `indent_stack.last()` (block mapping/sequence indent)
        //     and `compact_sequence_indents.last()` — the latter
        //     tracks sequences opened compactly (e.g. `? - x` where
        //     the dash didn't push to indent_stack). Without the
        //     compact-stack check, `? - Detroit Tigers\n  - Chicago`
        //     would fold both lines into one scalar (yaml-test-
        //     suite M5DY).
        //     Fall back to `next_col >= start_col` for top-level
        //     scalars where there's no enclosing block.
        let column_ok = if self.flow_level > 0 {
            true
        } else {
            let block_indent = self.indent_stack.last().copied().unwrap_or(0);
            let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
            let parent_indent = block_indent.max(compact_indent);
            next_col >= parent_indent + 2 || next_col >= start_col
        };
        let can_continue = next_ch.is_some()
            && !matches!(next_ch, Some('\n' | '\r' | '#'))
            && column_ok
            && !is_doc_marker
            && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));

        if !can_continue {
            // Backtrack to the end of the last accepted line so the line
            // break and what follows are scanned by the regular machinery.
            self.position = saved_position;
            self.current_char_index = saved_index;
            self.current_char = saved_char;
            break;
        }

        // Append fold separator: single newline → space; N>1 newlines
        // collapse to N-1 retained newlines (YAML §6.5 line folding).
        if newlines <= 1 {
            value.push(' ');
        } else {
            for _ in 0..(newlines - 1) {
                value.push('\n');
            }
        }
        multi_line = true;
    }

    // YAML 1.2 §8.1.3: implicit keys must be on a single line. If the
    // plain scalar folded across line breaks AND the next non-
    // whitespace char is `:` (key-value separator), it's about to be
    // used as an implicit key — reject (yaml-test-suite G7JE).
    if multi_line && self.flow_level == 0 {
        // Look past spaces/tabs without consuming them.
        let mut off = 0isize;
        while matches!(self.peek_char(off), Some(' ' | '\t')) {
            off += 1;
        }
        if self.peek_char(off) == Some(':') {
            return Err(Error::scan(
                self.position,
                "Multi-line plain scalar may not be used as an implicit key".to_string(),
            ));
        }
    }

    // The length check sees the text before the final trailing-whitespace
    // trim below.
    self.resource_tracker
        .check_string_length(&self.limits, value.len())?;

    let value = value.trim_end().to_string();
    let normalized_value = Self::normalize_scalar(value);

    Ok(Token::new(
        TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
        start_pos,
        self.position,
    ))
}
929
/// Scan a single- or double-quoted scalar.
///
/// The character under the cursor is skipped unconditionally as the opening
/// quote; scanning then runs until the matching `quote_char` is consumed.
///
/// * `quote_char` — the delimiter. `'\''` selects single-quoted rules (`''`
///   is the only escape, backslash is literal), `'"'` selects double-quoted
///   rules (backslash escapes are decoded). Any other character is tagged
///   with `QuoteStyle::Plain`.
///
/// Returns a `Scalar` token holding the decoded value and quote style,
/// spanning from the opening quote to the position after the closing quote.
///
/// # Errors
///
/// Returns a scan error for: an invalid or truncated escape sequence, a hex
/// escape naming an invalid codepoint, a tab used as continuation-line
/// indentation inside a block, a continuation line that is not indented
/// deeper than the enclosing block, a `---` / `...` marker at column 1 inside
/// the scalar, a missing closing quote, unexpected content after the closing
/// quote, or a multi-line scalar followed by `:` in block context. Returns a
/// limit error when the value exceeds `limits.max_string_length`, and
/// propagates whatever `resource_tracker.check_string_length` reports.
fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
    let start_pos = self.position;
    let mut value = String::new();

    // Determine quote style based on quote character
    let quote_style = match quote_char {
        '\'' => tokens::QuoteStyle::Single,
        '"' => tokens::QuoteStyle::Double,
        _ => tokens::QuoteStyle::Plain,
    };

    self.advance(); // Skip opening quote
    // Set once the closing quote has been consumed; checked after the loop
    // to distinguish a proper close from running out of input.
    let mut closed = false;
    // Set when an unescaped line break is folded; used after the loop to
    // reject a folded scalar that is followed by `:` in block context.
    let mut multi_line = false;
    // High-water mark of bytes contributed by escape sequences. The
    // trailing-whitespace strip at fold time must not pop past it,
    // because an escape-produced \t / space is literal content
    // (yaml-test-suite DE56/00, DE56/01).
    let mut escape_end: usize = 0;

    while let Some(ch) = self.current_char {
        if ch == quote_char {
            // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
            // collapsing to a single `'`. Detect that here BEFORE
            // treating the quote as the closing delimiter.
            if quote_char == '\'' && self.peek_char(1) == Some('\'') {
                value.push('\'');
                self.advance();
                self.advance();
                continue;
            }
            self.advance(); // Skip closing quote
            closed = true;
            break;
        } else if ch == '\\' && quote_char == '"' {
            // Backslash escape (double-quoted only): step onto the escape
            // character. If input ends right here, nothing is pushed and
            // the outer loop terminates, yielding the "Unclosed" error.
            self.advance();
            if let Some(escaped) = self.current_char {
                match escaped {
                    // YAML 1.2 §5.7 double-quoted escape allowlist.
                    'n' => value.push('\n'),
                    't' => value.push('\t'),
                    'r' => value.push('\r'),
                    '\\' => value.push('\\'),
                    '"' => value.push('"'),
                    '0' => value.push('\0'),
                    'a' => value.push('\x07'),
                    'b' => value.push('\x08'),
                    'f' => value.push('\x0C'),
                    'v' => value.push('\x0B'),
                    'e' => value.push('\x1B'),
                    ' ' => value.push(' '),
                    '/' => value.push('/'),
                    'N' => value.push('\u{0085}'),
                    '_' => value.push('\u{00A0}'),
                    'L' => value.push('\u{2028}'),
                    'P' => value.push('\u{2029}'),
                    '\n' => {
                        // Escaped line break (§7.3.2): the newline is
                        // dropped AND leading whitespace on the next
                        // line is excluded from the content.
                        //
                        // NOTE(review): only `\n` is matched here, so a
                        // backslash followed by `\r` (CRLF input) reaches
                        // the invalid-escape arm below — confirm whether
                        // line endings are normalized before scanning.
                        // NOTE(review): this arm does not set `multi_line`,
                        // so the implicit-key check after the loop does not
                        // see escaped line breaks — confirm this is intended.
                        self.advance();
                        while matches!(self.current_char, Some(' ' | '\t')) {
                            self.advance();
                        }
                        continue;
                    }
                    '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
                    // Hex / Unicode escapes per YAML 1.2 §5.7:
                    //   \xNN        — 2 hex digits, codepoint ≤ 0xFF
                    //   \uNNNN      — 4 hex digits, codepoint ≤ 0xFFFF
                    //   \UNNNNNNNN  — 8 hex digits, full Unicode codepoint
                    'x' | 'u' | 'U' => {
                        // Number of hex digits this escape form requires.
                        let n = match escaped {
                            'x' => 2,
                            'u' => 4,
                            _ => 8,
                        };
                        self.advance(); // consume the x/u/U
                        // Accumulate the digits most-significant first,
                        // four bits per digit.
                        let mut codepoint: u32 = 0;
                        for _ in 0..n {
                            let c = self.current_char.ok_or_else(|| {
                                Error::scan(
                                    self.position,
                                    format!("Truncated \\{escaped} escape"),
                                )
                            })?;
                            let d = c.to_digit(16).ok_or_else(|| {
                                Error::scan(
                                    self.position,
                                    format!("Invalid hex digit `{c}` in \\{escaped} escape"),
                                )
                            })?;
                            codepoint = (codepoint << 4) | d;
                            self.advance();
                        }
                        // `char::from_u32` rejects values that are not
                        // valid `char`s (surrogates, out-of-range).
                        let ch = char::from_u32(codepoint).ok_or_else(|| {
                            Error::scan(
                                self.position,
                                format!("Invalid Unicode codepoint U+{codepoint:X}"),
                            )
                        })?;
                        value.push(ch);
                        escape_end = value.len();
                        continue; // already advanced past hex digits
                    }
                    // Everything else is invalid per spec.
                    _ => {
                        return Err(Error::scan(
                            self.position,
                            format!("Invalid escape sequence: \\{escaped}"),
                        ));
                    }
                }
                // Single-character escapes land here: record that the
                // value up to this point is escape-produced, then step
                // past the escape character itself.
                escape_end = value.len();
                self.advance();
            }
        } else if ch == '\\' {
            // Single-quoted strings have no backslash escapes — `\` is
            // a literal character. (Single-quote escape is `''`.)
            value.push(ch);
            self.advance();
        } else if ch == '\n' || ch == '\r' {
            // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
            // line folding: a single newline within a quoted scalar
            // folds to a space; N>1 consecutive newlines retain N-1;
            // leading whitespace on the continuation line is excluded.
            let mut newlines = 0usize;
            // §6.1: tabs cannot be indentation. A continuation
            // line that BEGINS with a tab (no leading spaces) in
            // an enclosing block context is invalid (yaml-test-
            // suite DK95/01). Tabs that appear AFTER spaces in
            // the same indent area are content, not indentation.
            let mut just_after_newline = false;
            // Consume the whole run of line breaks and blank space up to
            // the next content character. Only `\n` increments the
            // count; `\r` is consumed without counting.
            while let Some(c) = self.current_char {
                match c {
                    '\n' => {
                        newlines += 1;
                        multi_line = true;
                        self.advance();
                        just_after_newline = true;
                    }
                    '\r' => {
                        self.advance();
                    }
                    ' ' => {
                        self.advance();
                        just_after_newline = false;
                    }
                    '\t' if just_after_newline
                        && self.flow_level == 0
                        && (self.indent_stack.len() > 1
                            || !self.compact_sequence_indents.is_empty()) =>
                    {
                        return Err(Error::scan(
                            self.position,
                            "Tab cannot serve as indentation of quoted scalar continuation"
                                .to_string(),
                        ));
                    }
                    '\t' => {
                        self.advance();
                    }
                    _ => break,
                }
            }
            // §8.1.4: a multi-line quoted scalar inside a block
            // context must indent each continuation more than the
            // enclosing block. `quoted: "a\nb"` with `b` at col 1
            // violates the rule because `quoted:` sits at indent 0
            // (yaml-test-suite QB6E). Only fires when there IS an
            // enclosing block (indent_stack > [0] or compact-seq
            // active) — top-level quoted scalars with continuation
            // at col 1 are legal. Skipped when the run ended at EOF
            // or on another line break.
            if newlines > 0
                && self.flow_level == 0
                && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
                && !matches!(self.current_char, None | Some('\n' | '\r'))
            {
                let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
                // Columns are 1-based; convert to a 0-based indent width.
                let indent = self.position.column.saturating_sub(1);
                if indent <= parent_indent {
                    return Err(Error::scan(
                        self.position,
                        "Quoted scalar continuation line is not indented enough".to_string(),
                    ));
                }
            }
            // §6.8: a doc-start/end marker (`---` or `...`) at
            // column 1 always terminates the current document.
            // Encountering one inside an unterminated quoted
            // scalar is invalid — the quote escapes nothing past
            // the doc boundary (yaml-test-suite 5TRB, RXY3,
            // 9MQT/01).
            if self.position.column == 1 {
                // The next three characters, or an empty string when
                // fewer than three remain.
                let next3: String = self
                    .char_cache
                    .get(self.current_char_index..self.current_char_index + 3)
                    .map(|s| s.iter().collect())
                    .unwrap_or_default();
                // The marker only counts when followed by whitespace or
                // end of input.
                if (next3 == "---" || next3 == "...")
                    && self
                        .char_cache
                        .get(self.current_char_index + 3)
                        .map_or(true, |c| c.is_whitespace())
                {
                    return Err(Error::scan(
                        self.position,
                        format!(
                            "Document {} marker `{}` inside quoted scalar",
                            if next3 == "---" { "start" } else { "end" },
                            next3
                        ),
                    ));
                }
            }
            // Drop trailing whitespace on the prior line (the bytes
            // we already pushed) before applying the fold. Don't
            // strip past `escape_end` — escape-produced whitespace
            // is literal content, not "trailing" line whitespace.
            while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
                value.pop();
            }
            // Apply the fold: one break becomes a space, N breaks
            // become N-1 newlines.
            if newlines <= 1 {
                value.push(' ');
            } else {
                for _ in 0..(newlines - 1) {
                    value.push('\n');
                }
            }
        } else {
            // Ordinary content character.
            value.push(ch);
            self.advance();

            // Check string length periodically to fail fast
            if value.len() > self.limits.max_string_length {
                return Err(Error::limit_exceeded(format!(
                    "String length {} exceeds maximum {}",
                    value.len(),
                    self.limits.max_string_length
                )));
            }
        }
    }

    // The loop ended without consuming a closing quote: input ran out
    // inside the scalar.
    if !closed {
        return Err(Error::scan(
            self.position,
            format!(
                "Unclosed {} quoted string",
                if quote_char == '"' {
                    "double"
                } else {
                    "single"
                }
            ),
        ));
    }

    // Check string length limit on the final decoded value.
    self.resource_tracker
        .check_string_length(&self.limits, value.len())?;

    // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
    // the line (or sub-expression in flow context) must be empty save
    // for a separator. Skip horizontal whitespace and look at the next
    // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
    // newline/EOF, it's a trailing-content error (yaml-test-suite
    // Q4CL: `"quoted2" trailing content`).
    {
        // Look ahead only — nothing is consumed in this block.
        let mut offset = 0isize;
        let mut saw_space = false;
        while matches!(self.peek_char(offset), Some(' ' | '\t')) {
            saw_space = true;
            offset += 1;
        }
        let next = self.peek_char(offset);
        // A `#` is a comment indicator ONLY when preceded by whitespace
        // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
        let ok = match next {
            None => true,
            Some('#') => saw_space,
            Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
        };
        if !ok {
            return Err(Error::scan(
                self.position,
                format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
            ));
        }
        // YAML 1.2 §8.1.3: implicit keys must be on a single line.
        // If the scalar folded across line breaks AND the next non-
        // whitespace char is `:` (key-value separator), the scalar
        // is being used as an implicit key — error.
        if multi_line && self.flow_level == 0 && next == Some(':') {
            return Err(Error::scan(
                self.position,
                "Multi-line quoted scalar may not be used as an implicit key".to_string(),
            ));
        }
    }

    Ok(Token::new(
        TokenType::Scalar(value, quote_style),
        start_pos,
        self.position,
    ))
}
1238
1239 /// Scan document start marker (---)
1240 fn scan_document_start(&mut self) -> Result<Option<Token>> {
1241 if self.current_char == Some('-')
1242 && self.peek_char(1) == Some('-')
1243 && self.peek_char(2) == Some('-')
1244 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1245 {
1246 // Doc markers are invalid inside flow collections.
1247 if self.flow_level > 0 {
1248 return Err(Error::scan(
1249 self.position,
1250 "`---` document-start marker is not allowed inside a flow collection"
1251 .to_string(),
1252 ));
1253 }
1254 let start_pos = self.position;
1255 self.advance(); // -
1256 self.advance(); // -
1257 self.advance(); // -
1258
1259 Ok(Some(Token::new(
1260 TokenType::DocumentStart,
1261 start_pos,
1262 self.position,
1263 )))
1264 } else {
1265 Ok(None)
1266 }
1267 }
1268
1269 /// Scan YAML version directive (%YAML)
1270 fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1271 if self.current_char != Some('%') {
1272 return Ok(None);
1273 }
1274
1275 let start_pos = self.position;
1276 let saved_position = self.position;
1277 self.advance(); // Skip '%'
1278
1279 // Check for "YAML"
1280 if self.current_char == Some('Y')
1281 && self.peek_char(1) == Some('A')
1282 && self.peek_char(2) == Some('M')
1283 && self.peek_char(3) == Some('L')
1284 && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1285 {
1286 self.advance(); // Y
1287 self.advance(); // A
1288 self.advance(); // M
1289 self.advance(); // L
1290
1291 // Skip whitespace
1292 self.skip_whitespace();
1293
1294 // Parse version number (e.g., "1.2")
1295 let major = if let Some(ch) = self.current_char {
1296 if ch.is_ascii_digit() {
1297 let digit = ch.to_digit(10).unwrap() as u8;
1298 self.advance();
1299 digit
1300 } else {
1301 return Err(Error::scan(
1302 self.position,
1303 "Expected major version number after %YAML".to_string(),
1304 ));
1305 }
1306 } else {
1307 return Err(Error::scan(
1308 self.position,
1309 "Expected version after %YAML directive".to_string(),
1310 ));
1311 };
1312
1313 // Expect '.'
1314 if self.current_char != Some('.') {
1315 return Err(Error::scan(
1316 self.position,
1317 "Expected '.' in YAML version".to_string(),
1318 ));
1319 }
1320 self.advance();
1321
1322 // Parse minor version
1323 let minor = if let Some(ch) = self.current_char {
1324 if ch.is_ascii_digit() {
1325 let digit = ch.to_digit(10).unwrap() as u8;
1326 self.advance();
1327 digit
1328 } else {
1329 return Err(Error::scan(
1330 self.position,
1331 "Expected minor version number after '.'".to_string(),
1332 ));
1333 }
1334 } else {
1335 return Err(Error::scan(
1336 self.position,
1337 "Expected minor version number".to_string(),
1338 ));
1339 };
1340
1341 // YAML 1.2 §6.8.1: the directive line must end after the
1342 // version (modulo whitespace and an optional comment). Extra
1343 // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1344 // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1345 // whitespace before `#`.
1346 let mut saw_space = false;
1347 while matches!(self.current_char, Some(' ' | '\t')) {
1348 saw_space = true;
1349 self.advance();
1350 }
1351 match self.current_char {
1352 None | Some('\n' | '\r') => {}
1353 Some('#') if saw_space => {
1354 while let Some(ch) = self.current_char {
1355 if ch == '\n' || ch == '\r' {
1356 break;
1357 }
1358 self.advance();
1359 }
1360 }
1361 Some(c) => {
1362 return Err(Error::scan(
1363 self.position,
1364 format!("Unexpected `{c}` after %YAML directive"),
1365 ));
1366 }
1367 }
1368
1369 Ok(Some(Token::new(
1370 TokenType::YamlDirective(major, minor),
1371 start_pos,
1372 self.position,
1373 )))
1374 } else {
1375 // Not a YAML directive, reset position
1376 self.position = saved_position;
1377 // Properly reset current_char based on saved position
1378 self.current_char = self
1379 .char_indices
1380 .iter()
1381 .find(|(i, _)| *i == saved_position.index)
1382 .map(|(_, ch)| *ch);
1383 // Reset the current_char_index
1384 self.current_char_index = self
1385 .char_indices
1386 .iter()
1387 .position(|(i, _)| *i == saved_position.index)
1388 .unwrap_or(0);
1389 Ok(None)
1390 }
1391 }
1392
1393 /// Scan TAG directive (%TAG)
1394 fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1395 if self.current_char != Some('%') {
1396 return Ok(None);
1397 }
1398
1399 let start_pos = self.position;
1400 let saved_position = self.position;
1401 self.advance(); // Skip '%'
1402
1403 // Check for "TAG"
1404 if self.current_char == Some('T')
1405 && self.peek_char(1) == Some('A')
1406 && self.peek_char(2) == Some('G')
1407 && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1408 {
1409 self.advance(); // T
1410 self.advance(); // A
1411 self.advance(); // G
1412
1413 // Skip whitespace
1414 self.skip_whitespace();
1415
1416 // Parse handle (e.g., "!" or "!!")
1417 let handle = self.scan_tag_handle()?;
1418
1419 // Skip whitespace
1420 self.skip_whitespace();
1421
1422 // Parse prefix (URI)
1423 let prefix = self.scan_tag_prefix()?;
1424
1425 Ok(Some(Token::new(
1426 TokenType::TagDirective(handle, prefix),
1427 start_pos,
1428 self.position,
1429 )))
1430 } else {
1431 // Reset position if not a TAG directive
1432 self.position = saved_position;
1433 // Properly reset current_char based on saved position
1434 self.current_char = self
1435 .char_indices
1436 .iter()
1437 .find(|(i, _)| *i == saved_position.index)
1438 .map(|(_, ch)| *ch);
1439 // Reset the current_char_index
1440 self.current_char_index = self
1441 .char_indices
1442 .iter()
1443 .position(|(i, _)| *i == saved_position.index)
1444 .unwrap_or(0);
1445 Ok(None)
1446 }
1447 }
1448
1449 /// Scan a tag handle for TAG directive
1450 fn scan_tag_handle(&mut self) -> Result<String> {
1451 let mut handle = String::new();
1452
1453 if self.current_char != Some('!') {
1454 return Err(Error::scan(
1455 self.position,
1456 "Expected '!' at start of tag handle".to_string(),
1457 ));
1458 }
1459
1460 handle.push('!');
1461 self.advance();
1462
1463 // Handle can be "!" or "!!" or "!name!"
1464 if self.current_char == Some('!') {
1465 // Secondary handle "!!"
1466 handle.push('!');
1467 self.advance();
1468 } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1469 // Named handle like "!name!"
1470 while let Some(ch) = self.current_char {
1471 if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1472 handle.push(ch);
1473 self.advance();
1474 } else if ch == '!' {
1475 handle.push(ch);
1476 self.advance();
1477 break;
1478 } else {
1479 break;
1480 }
1481 }
1482 }
1483 // else just "!" primary handle
1484
1485 Ok(handle)
1486 }
1487
1488 /// Scan a tag prefix (URI) for TAG directive
1489 fn scan_tag_prefix(&mut self) -> Result<String> {
1490 let mut prefix = String::new();
1491
1492 // Read until end of line or comment
1493 while let Some(ch) = self.current_char {
1494 if ch == '\n' || ch == '\r' || ch == '#' {
1495 break;
1496 }
1497 if ch.is_whitespace() && prefix.is_empty() {
1498 self.advance();
1499 continue;
1500 }
1501 if ch.is_whitespace() && !prefix.is_empty() {
1502 // Trailing whitespace, we're done
1503 break;
1504 }
1505 prefix.push(ch);
1506 self.advance();
1507 }
1508
1509 if prefix.is_empty() {
1510 return Err(Error::scan(
1511 self.position,
1512 "Expected tag prefix after tag handle".to_string(),
1513 ));
1514 }
1515
1516 Ok(prefix.trim().to_string())
1517 }
1518
1519 /// Check if current position might be a directive
1520 fn is_directive(&self) -> bool {
1521 self.current_char == Some('%') && self.position.column == 1
1522 }
1523
1524 /// Scan document end marker (...)
1525 fn scan_document_end(&mut self) -> Result<Option<Token>> {
1526 if self.current_char == Some('.')
1527 && self.peek_char(1) == Some('.')
1528 && self.peek_char(2) == Some('.')
1529 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1530 {
1531 // Doc markers are invalid inside flow collections.
1532 if self.flow_level > 0 {
1533 return Err(Error::scan(
1534 self.position,
1535 "`...` document-end marker is not allowed inside a flow collection".to_string(),
1536 ));
1537 }
1538 let start_pos = self.position;
1539 self.advance(); // .
1540 self.advance(); // .
1541 self.advance(); // .
1542
1543 // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1544 // end-of-line (comments allowed). Inline content after `...`
1545 // is invalid (yaml-test-suite 3HFZ).
1546 while let Some(ch) = self.current_char {
1547 match ch {
1548 ' ' | '\t' => {
1549 self.advance();
1550 }
1551 '\n' | '\r' | '#' => break,
1552 _ => {
1553 return Err(Error::scan(
1554 self.position,
1555 "Content after `...` document-end marker is invalid".to_string(),
1556 ));
1557 }
1558 }
1559 }
1560
1561 Ok(Some(Token::new(
1562 TokenType::DocumentEnd,
1563 start_pos,
1564 self.position,
1565 )))
1566 } else {
1567 Ok(None)
1568 }
1569 }
1570
1571 /// Scan a comment token
1572 fn scan_comment(&mut self) -> Result<Token> {
1573 let start_pos = self.position;
1574 let mut comment_text = String::new();
1575
1576 // Skip the '#' character
1577 if self.current_char == Some('#') {
1578 self.advance();
1579 }
1580
1581 // Collect the comment text
1582 while let Some(ch) = self.current_char {
1583 if ch == '\n' || ch == '\r' {
1584 break;
1585 }
1586 comment_text.push(ch);
1587 self.advance();
1588 }
1589
1590 // Trim leading whitespace from comment text
1591 let comment_text = comment_text.trim_start().to_string();
1592
1593 Ok(Token::new(
1594 TokenType::Comment(comment_text),
1595 start_pos,
1596 self.position,
1597 ))
1598 }
1599
1600 /// Process a line and generate appropriate tokens
1601 #[allow(clippy::cognitive_complexity)]
1602 fn process_line(&mut self) -> Result<()> {
1603 // Check for directives at start of line
1604 if self.position.column == 1 && self.current_char == Some('%') {
1605 // Try to scan YAML directive
1606 if let Some(token) = self.scan_yaml_directive()? {
1607 self.tokens.push(token);
1608 return Ok(());
1609 }
1610
1611 // Try to scan TAG directive
1612 if let Some(token) = self.scan_tag_directive()? {
1613 self.tokens.push(token);
1614 return Ok(());
1615 }
1616
1617 // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1618 // does not recognize. Skip the line silently — parsing continues
1619 // with whatever follows on the next line.
1620 if self.current_char == Some('%') {
1621 while let Some(ch) = self.current_char {
1622 if ch == '\n' || ch == '\r' {
1623 break;
1624 }
1625 self.advance();
1626 }
1627 return Ok(());
1628 }
1629 }
1630
1631 // Check for document markers at start of line
1632 if self.position.column == 1 {
1633 // Check for document start marker
1634 if let Some(token) = self.scan_document_start()? {
1635 self.tokens.push(token);
1636 return Ok(());
1637 }
1638
1639 // Check for document end marker
1640 if let Some(token) = self.scan_document_end()? {
1641 self.tokens.push(token);
1642 return Ok(());
1643 }
1644 }
1645
1646 // Handle indentation at start of line
1647 if self.position.column == 1 {
1648 self.handle_indentation()?;
1649 }
1650
1651 // Skip empty lines and comments
1652 self.skip_whitespace();
1653
1654 match self.current_char {
1655 None => return Ok(()),
1656 Some('#') => {
1657 if self.preserve_comments {
1658 // Create a comment token
1659 let comment_token = self.scan_comment()?;
1660 self.tokens.push(comment_token);
1661 } else {
1662 // Skip comment lines
1663 while let Some(ch) = self.current_char {
1664 if ch == '\n' || ch == '\r' {
1665 break;
1666 }
1667 self.advance();
1668 }
1669 }
1670 return Ok(());
1671 }
1672 Some('\n' | '\r') => {
1673 self.advance();
1674 return Ok(());
1675 }
1676 _ => {}
1677 }
1678
1679 // Process tokens on this line
1680 while let Some(ch) = self.current_char {
1681 match ch {
1682 '\n' | '\r' => break,
1683 ' ' | '\t' => {
1684 self.skip_whitespace();
1685 }
1686 '#' => {
1687 // YAML 1.2 §6.6: a comment must be preceded by whitespace
1688 // OR be at the start of a line. Inputs like `,#invalid`
1689 // (yaml-test-suite CVW2) are not valid comments.
1690 let prev = self.peek_char(-1);
1691 let at_line_start = self.position.column == 1;
1692 let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1693 if !at_line_start && !preceded_by_space {
1694 return Err(Error::scan(
1695 self.position,
1696 "Comment `#` must be preceded by whitespace".to_string(),
1697 ));
1698 }
1699 if self.preserve_comments {
1700 let comment_token = self.scan_comment()?;
1701 self.tokens.push(comment_token);
1702 } else {
1703 while let Some(ch) = self.current_char {
1704 if ch == '\n' || ch == '\r' {
1705 break;
1706 }
1707 self.advance();
1708 }
1709 }
1710 break;
1711 }
1712
1713 // Flow indicators. §7.4 allows a flow collection as
1714 // the implicit key of a block mapping (`[a]: b`,
1715 // `{x: y}: z`). When the flow-open is at line-start
1716 // (block context) and a `:` follows on the same line,
1717 // open the wrapping block mapping at the column of the
1718 // flow-open token, just as we do for line-start
1719 // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1720 '[' => {
1721 if self.flow_level == 0
1722 && self.position.column == self.current_indent + 1
1723 && self.check_for_mapping_ahead()
1724 {
1725 self.maybe_open_block_mapping_for_key()?;
1726 }
1727 let pos = self.position;
1728 self.advance();
1729 self.flow_level += 1;
1730 // Check depth limit
1731 self.resource_tracker
1732 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1733 self.tokens
1734 .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1735 }
1736 ']' => {
1737 // YAML 1.2 §7.4: `]` is only valid inside an open
1738 // flow sequence. Stray `]` is a syntax error
1739 // (yaml-test-suite 4H7K).
1740 if self.flow_level == 0 {
1741 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1742 .with_suggestion(
1743 "Remove the extra `]` or open a flow sequence with `[` first"
1744 .to_string(),
1745 );
1746 return Err(Error::scan_with_context(
1747 self.position,
1748 "Unexpected `]` outside flow context",
1749 context,
1750 ));
1751 }
1752 let pos = self.position;
1753 self.advance();
1754 self.flow_level -= 1;
1755 self.tokens
1756 .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1757 }
1758 '{' => {
1759 if self.flow_level == 0
1760 && self.position.column == self.current_indent + 1
1761 && self.check_for_mapping_ahead()
1762 {
1763 self.maybe_open_block_mapping_for_key()?;
1764 }
1765 let pos = self.position;
1766 self.advance();
1767 self.flow_level += 1;
1768 // Check depth limit
1769 self.resource_tracker
1770 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1771 self.tokens
1772 .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1773 }
1774 '}' => {
1775 if self.flow_level == 0 {
1776 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1777 .with_suggestion(
1778 "Remove the extra `}` or open a flow mapping with `{` first"
1779 .to_string(),
1780 );
1781 return Err(Error::scan_with_context(
1782 self.position,
1783 "Unexpected `}` outside flow context",
1784 context,
1785 ));
1786 }
1787 let pos = self.position;
1788 self.advance();
1789 self.flow_level -= 1;
1790 self.tokens
1791 .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1792 }
1793 ',' => {
1794 // §7.4: \`,\` is a flow indicator. Outside flow
1795 // context it's not meaningful as a structural
1796 // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1797 // — the comma after a tag in block context is
1798 // invalid).
1799 if self.flow_level == 0 {
1800 return Err(Error::scan(
1801 self.position,
1802 "Unexpected `,` outside flow context".to_string(),
1803 ));
1804 }
1805 let pos = self.position;
1806 self.advance();
1807 self.tokens
1808 .push(Token::new(TokenType::FlowEntry, pos, self.position));
1809 }
1810
1811 // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1812 // * Block context: `:` separates key from value only when
1813 // followed by whitespace / EOF — otherwise it's part of
1814 // a plain scalar (e.g. `:foo`, `URL://path`).
1815 // * Flow context: same, plus `:` may be adjacent to a
1816 // value when the previous token completed a key node
1817 // (quoted/plain scalar, alias, or closed flow
1818 // collection) — see yaml-test-suite 5MUD, 5T43.
1819 ':' if self.peek_char(1).map_or(true, |c| {
1820 c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1821 }) || (self.flow_level > 0
1822 && matches!(
1823 self.tokens.last().map(|t| &t.token_type),
1824 Some(
1825 TokenType::Scalar(_, _)
1826 | TokenType::Alias(_)
1827 | TokenType::FlowMappingEnd
1828 | TokenType::FlowSequenceEnd
1829 )
1830 )) =>
1831 {
1832 // §6.2: a \`:\` at line-start (the explicit-value
1833 // counterpart of an explicit \`?\` key) must be
1834 // followed by a SPACE — a tab as separator is
1835 // invalid (yaml-test-suite Y79Y/007, /009).
1836 if self.flow_level == 0
1837 && self.position.column == self.current_indent + 1
1838 && self.peek_char(1) == Some('\t')
1839 {
1840 return Err(Error::scan(
1841 self.position,
1842 "Tab cannot follow line-start `:` as explicit-value separator"
1843 .to_string(),
1844 ));
1845 }
1846 // §8.22: an implicit key in block context must fit
1847 // on a single line. If the previous token is a
1848 // flow-collection close whose matching open is on
1849 // a different line, the flow node spans multiple
1850 // lines and can't serve as the key (yaml-test-
1851 // suite C2SP \`[23\\n]: 42\`).
1852 if self.flow_level == 0 {
1853 let mut is_flow_close = false;
1854 let mut close_end_line = 0;
1855 if let Some(last) = self.tokens.last() {
1856 if matches!(
1857 last.token_type,
1858 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1859 ) {
1860 is_flow_close = true;
1861 close_end_line = last.end_position.line;
1862 }
1863 }
1864 if is_flow_close {
1865 let mut depth = 0i32;
1866 let mut open_idx: Option<usize> = None;
1867 for (idx, t) in self.tokens.iter().enumerate().rev() {
1868 match &t.token_type {
1869 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1870 depth += 1;
1871 }
1872 TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1873 depth -= 1;
1874 if depth == 0 {
1875 open_idx = Some(idx);
1876 break;
1877 }
1878 }
1879 _ => {}
1880 }
1881 }
1882 if let Some(oi) = open_idx {
1883 let open_line = self.tokens[oi].start_position.line;
1884 // If a `?` (Key) token precedes the
1885 // matching flow open on the same line
1886 // as the key, the key is explicit and
1887 // may span lines (yaml-test-suite M5DY
1888 // \`? [ ...spans... ]: [ ... ]\`).
1889 let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1890 matches!(t.token_type, TokenType::Key)
1891 && t.start_position.line == open_line
1892 });
1893 if !key_marker_before && open_line != close_end_line {
1894 return Err(Error::scan(
1895 self.position,
1896 "Implicit key in block context: flow collection key spans multiple lines"
1897 .to_string(),
1898 ));
1899 }
1900 }
1901 }
1902 }
1903 let pos = self.position;
1904 self.advance();
1905 self.tokens
1906 .push(Token::new(TokenType::Value, pos, self.position));
1907 }
1908
1909 // §6.2: the explicit-key marker \`?\` must be followed
1910 // by a SPACE (or EOL), not a tab. Tab as separator
1911 // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1912 '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1913 return Err(Error::scan(
1914 self.position,
1915 "Tab cannot follow `?` as block-key separator".to_string(),
1916 ));
1917 }
1918
1919 // Explicit key marker. An indented `?` at line-start
1920 // (e.g. `mapping:\\n ? key`) opens an implicit block
1921 // mapping at this column — same as a line-start scalar
1922 // key. Without this, scan_plain_scalar wouldn't see
1923 // the inner mapping's indent and would wrongly fold
1924 // the key content into a multi-line scalar
1925 // (yaml-test-suite S9E8, KK5P).
1926 '?' if self.flow_level == 0
1927 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1928 || self.peek_char(1).is_none()) =>
1929 {
1930 if self.position.column == self.current_indent + 1 {
1931 self.maybe_open_block_mapping_for_key()?;
1932 }
1933 let pos = self.position;
1934 self.advance();
1935 self.tokens
1936 .push(Token::new(TokenType::Key, pos, self.position));
1937 }
1938 '?' if self.flow_level > 0
1939 && (self
1940 .peek_char(1)
1941 .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1942 || self.peek_char(1).is_none()) =>
1943 {
1944 let pos = self.position;
1945 self.advance();
1946 self.tokens
1947 .push(Token::new(TokenType::Key, pos, self.position));
1948 }
1949
1950 // Block entry
1951 '-' if self.flow_level == 0
1952 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1953 || self.peek_char(1).is_none()) =>
1954 {
1955 // A block-entry \`-\` immediately after a flow
1956 // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1957 // is invalid — no separator between the closed
1958 // flow node and the next sibling (yaml-test-suite
1959 // P2EQ \`- { y: z }- invalid\`). The same-line guard
1960 // is essential — a \`}\` on a previous line with a
1961 // new \`-\` on the next line is perfectly valid.
1962 //
1963 // Likewise, a block-entry \`-\` immediately after a
1964 // property (Anchor / Tag) on the same line is
1965 // invalid — the property must precede a node, and
1966 // a block sequence's first \`-\` must begin a line
1967 // (yaml-test-suite SY6V \`&anchor - x\`).
1968 if let Some(last) = self.tokens.last() {
1969 if matches!(
1970 last.token_type,
1971 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1972 ) && last.end_position.line == self.position.line
1973 {
1974 return Err(Error::scan(
1975 self.position,
1976 "Block-entry `-` immediately after flow collection close"
1977 .to_string(),
1978 ));
1979 }
1980 if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1981 && last.end_position.line == self.position.line
1982 {
1983 return Err(Error::scan(
1984 self.position,
1985 "Block-entry `-` cannot follow a property on the same line"
1986 .to_string(),
1987 ));
1988 }
1989 // §8.22: a block sequence's first \`-\` must
1990 // begin on a new line. \`key: - a\` (implicit
1991 // key, then dash on same line) is invalid
1992 // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1993 // (explicit value-separator on the same line
1994 // as the dash) IS valid: the \`?\` key sits
1995 // on a previous line. We distinguish by
1996 // walking back from the Value: if the
1997 // preceding non-property token is a Scalar
1998 // on the same line as the Value, the key
1999 // is implicit; otherwise it's after \`?\`.
2000 if matches!(last.token_type, TokenType::Value)
2001 && last.end_position.line == self.position.line
2002 {
2003 let value_line = last.start_position.line;
2004 let mut prior_scalar_line = None;
2005 for t in self.tokens.iter().rev().skip(1) {
2006 match &t.token_type {
2007 TokenType::Anchor(_) | TokenType::Tag(_) => {}
2008 TokenType::Scalar(..) => {
2009 prior_scalar_line = Some(t.end_position.line);
2010 break;
2011 }
2012 _ => break,
2013 }
2014 }
2015 if prior_scalar_line == Some(value_line) {
2016 return Err(Error::scan(
2017 self.position,
2018 "Block sequence value cannot start on the same line as its key"
2019 .to_string(),
2020 ));
2021 }
2022 }
2023 }
2024 let pos = self.position;
2025 self.advance();
2026
2027 // Check if we need to start a new block sequence.
2028 // `unwrap_or(0)` mirrors the pattern in
2029 // src/scanner/indentation.rs and is safer than
2030 // `.unwrap()` here: an error-recovery pop in another
2031 // path could otherwise leave the stack empty and
2032 // panic on crafted input (#18).
2033 let last_indent = self.indent_stack.last().copied().unwrap_or(0);
2034
2035 // If a compact sequence (opened from `? - x` or
2036 // similar) is already active at this dash's column,
2037 // the dash continues it — don't open a new nested
2038 // block sequence (yaml-test-suite M5DY).
2039 let dash_indent = pos.column.saturating_sub(1);
2040 let compact_active_here = self
2041 .compact_sequence_indents
2042 .last()
2043 .map_or(false, |&si| si == dash_indent);
2044 if compact_active_here {
2045 // Continuation of an existing compact sequence.
2046 } else if self.current_indent > last_indent {
2047 // Deeper indentation - start new nested sequence
2048 self.indent_stack.push(self.current_indent);
2049 self.indent_is_sequence.push(true);
2050 // Check depth limit
2051 self.resource_tracker
2052 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2053 self.tokens
2054 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2055 } else if self.current_indent == last_indent
2056 && *self.indent_is_sequence.last().unwrap_or(&false)
2057 {
2058 // Same indent and the top of stack is already a sequence
2059 // → continuation of that sequence; no new start needed.
2060 } else if self.current_indent >= last_indent {
2061 // Same or root level — compact notation.
2062 // Start a new sequence only if we don't already have one
2063 // tracked at this exact indent.
2064 // For a dash that's *not* at line-start (e.g.
2065 // `? - x` where current_indent is still the
2066 // line's indent but the dash sits in mid-line),
2067 // use the dash column - 1 as the sequence's
2068 // indent so scan_plain_scalar's continuation
2069 // check correctly sees the deeper context
2070 // (yaml-test-suite M5DY).
2071 let dash_indent = pos.column.saturating_sub(1);
2072 let seq_indent = dash_indent.max(self.current_indent);
2073 let has_active_compact = self
2074 .compact_sequence_indents
2075 .last()
2076 .map_or(false, |&si| si == seq_indent);
2077
2078 if !has_active_compact {
2079 self.compact_sequence_indents.push(seq_indent);
2080 // Check depth limit
2081 self.resource_tracker.check_depth(
2082 &self.limits,
2083 self.flow_level + self.indent_stack.len(),
2084 )?;
2085 self.tokens
2086 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2087 }
2088 }
2089
2090 self.tokens
2091 .push(Token::new(TokenType::BlockEntry, pos, self.position));
2092
2093 // After emitting BlockEntry, check if the next
2094 // token is another dash (nested sequence). §6.2
2095 // requires SPACE separation between dashes — a
2096 // tab between the outer and inner \`-\` is invalid
2097 // (yaml-test-suite Y79Y/004, /005). Track whether
2098 // a tab was consumed while skipping the inter-
2099 // dash whitespace and reject if so.
2100 let mut saw_tab_between = false;
2101 while let Some(c) = self.current_char {
2102 if c == ' ' {
2103 self.advance();
2104 } else if c == '\t' {
2105 saw_tab_between = true;
2106 self.advance();
2107 } else {
2108 break;
2109 }
2110 }
2111 if self.current_char == Some('-')
2112 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2113 && saw_tab_between
2114 {
2115 return Err(Error::scan(
2116 self.position,
2117 "Tab between block-entries on same line".to_string(),
2118 ));
2119 }
2120 if self.current_char == Some('-')
2121 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2122 {
2123 // We have a nested sequence on the same line!
2124 // Track this as an inline sequence
2125 self.inline_sequence_depth += 1;
2126 // Push the *indent* (column - 1), not the
2127 // column, so it matches the convention used by
2128 // maybe_open_block_mapping_for_key. With column
2129 // here the next-line indent (column - 1) would
2130 // be strictly less than the stored value and
2131 // wrongly trigger an early close, breaking
2132 // multi-line nested sequences (yaml-test-suite
2133 // 3ALJ, 57H4).
2134 self.indent_stack
2135 .push(self.position.column.saturating_sub(1));
2136 self.indent_is_sequence.push(true);
2137 // Check depth limit
2138 self.resource_tracker
2139 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2140 self.tokens
2141 .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2142 // Continue processing - the next iteration will handle the nested dash
2143 } else if self.current_char.is_some()
2144 && !matches!(self.current_char, Some('\n' | '\r'))
2145 {
2146 // Content follows "- " on the same line.
2147 // Update current_indent to the content's column position so that
2148 // any mapping started here will be at a deeper indent level than
2149 // the sequence. This ensures handle_indentation properly closes
2150 // the mapping when the next sibling "- " appears.
2151 self.current_indent = self.position.column - 1;
2152 }
2153 }
2154
2155 // Quoted strings — same implicit-key mapping detection
2156 // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2157 '"' | '\'' => {
2158 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2159 self.maybe_open_block_mapping_for_key()?;
2160 }
2161 let token = self.scan_quoted_string(ch)?;
2162 self.tokens.push(token);
2163 }
2164
2165 // Document markers (only if not a block entry).
2166 //
2167 // Reached only when `-` is at column = current_indent + 1 AND
2168 // the next character is non-whitespace — i.e. either the
2169 // `---` document-start marker OR a plain scalar starting
2170 // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2171 // declines, we MUST consume the run as a plain scalar — not
2172 // consulting `is_plain_scalar_start` here, because that helper
2173 // unconditionally rejects `-`, which would leave the outer
2174 // `while let` loop spinning on the same character.
2175 '-' if self.position.column == self.current_indent + 1
2176 && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2177 {
2178 if let Some(token) = self.scan_document_start()? {
2179 self.tokens.push(token);
2180 } else {
2181 let token = self.scan_plain_scalar()?;
2182 self.tokens.push(token);
2183 }
2184 }
2185 '.' if self.position.column == self.current_indent + 1 => {
2186 if let Some(token) = self.scan_document_end()? {
2187 self.tokens.push(token);
2188 } else if self.is_plain_scalar_start() {
2189 let token = self.scan_plain_scalar()?;
2190 self.tokens.push(token);
2191 }
2192 }
2193
2194 // Numbers or plain scalars starting with -
2195 // Only scan as number if the entire token is numeric (no trailing letters)
2196 _ if (ch.is_ascii_digit()
2197 || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2198 && self.is_pure_number() =>
2199 {
2200 let token = self.scan_number()?;
2201 self.tokens.push(token);
2202 }
2203
2204 // Anchors and aliases. §6.9: a node's properties
2205 // (anchor/tag) are prefixes of the node. When an `&`,
2206 // `*`, or `!` is at the start of a line (column ==
2207 // current_indent + 1) and a `: ` follows on the same
2208 // line, the property/alias is part of an implicit
2209 // key's leading position. The block mapping that
2210 // contains this key therefore opens at this column,
2211 // *before* the property/alias token is emitted
2212 // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2213 '&' => {
2214 // Mirror H7J7 check for anchors (yaml-test-suite
2215 // G9HC \`seq:\\n&anchor\\n- a\`).
2216 if self.flow_level == 0
2217 && self.position.column == self.current_indent + 1
2218 && !self.check_for_mapping_ahead()
2219 && self.indent_stack.len() > 1
2220 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2221 && self.most_recent_token_is_value_separator()
2222 {
2223 return Err(Error::scan(
2224 self.position,
2225 "Anchor at line-start with insufficient indent for value position"
2226 .to_string(),
2227 ));
2228 }
2229 if self.flow_level == 0
2230 && self.position.column == self.current_indent + 1
2231 && self.check_for_mapping_ahead()
2232 {
2233 self.maybe_open_block_mapping_for_key()?;
2234 }
2235 let token = self.scan_anchor()?;
2236 self.tokens.push(token);
2237 }
2238 '*' => {
2239 // §6.9.2: alias/anchor names may contain \`:\` (only
2240 // flow indicators and whitespace terminate them).
2241 // So \`*a:\` is an alias named \`a:\`, NOT an alias
2242 // \`*a\` followed by a key separator. Don't open
2243 // an implicit block mapping in that case (yaml-
2244 // test-suite 2SXE).
2245 if self.flow_level == 0
2246 && self.position.column == self.current_indent + 1
2247 && self.check_for_mapping_ahead()
2248 && !self.colon_belongs_to_alias_anchor_name()
2249 {
2250 self.maybe_open_block_mapping_for_key()?;
2251 }
2252 let token = self.scan_alias()?;
2253 self.tokens.push(token);
2254 }
2255
2256 // Block scalars
2257 '|' => {
2258 let token = self.scan_literal_block_scalar()?;
2259 self.tokens.push(token);
2260 // Block scalar collection rewinds the cursor to the
2261 // start of the next under-indented line. `current_indent`
2262 // is still set to the inline content's column from the
2263 // enclosing `- |` / `key: |` site, so the next iteration
2264 // would mis-dispatch. Break out so the outer loop
2265 // re-enters `process_line` and reruns indent handling
2266 // (yaml-test-suite 4QFQ, M6YH, P2AD).
2267 break;
2268 }
2269 '>' => {
2270 let token = self.scan_folded_block_scalar()?;
2271 self.tokens.push(token);
2272 break;
2273 }
2274
2275 // Tags. Same line-start property-opens-mapping rule
2276 // (yaml-test-suite ZH7C variants).
2277 //
2278 // §6.9: a property at the SAME indent as the
2279 // enclosing mapping/sequence cannot apply to that
2280 // collection's value — the value must be more
2281 // indented. If we're at a line-start \`!\` whose column
2282 // equals the enclosing mapping's indent + 1 AND that
2283 // mapping currently has a key awaiting a value, the
2284 // tag is misplaced (yaml-test-suite H7J7).
2285 '!' => {
2286 if self.flow_level == 0
2287 && self.position.column == self.current_indent + 1
2288 && !self.check_for_mapping_ahead()
2289 && self.indent_stack.len() > 1
2290 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2291 && self.most_recent_token_is_value_separator()
2292 {
2293 return Err(Error::scan(
2294 self.position,
2295 "Tag at line-start with insufficient indent for value position"
2296 .to_string(),
2297 ));
2298 }
2299 if self.flow_level == 0
2300 && self.position.column == self.current_indent + 1
2301 && self.check_for_mapping_ahead()
2302 {
2303 self.maybe_open_block_mapping_for_key()?;
2304 }
2305 let token = self.scan_tag()?;
2306 self.tokens.push(token);
2307 }
2308
2309 // Plain scalars
2310 _ if self.is_plain_scalar_start() => {
2311 // A plain scalar starting on the SAME line as a
2312 // flow-collection close (\`}\` or \`]\`) means there's
2313 // no separator between the closed flow node and
2314 // the new content (yaml-test-suite 62EZ
2315 // \`x: { y: z }in: valid\`).
2316 if self.flow_level == 0 {
2317 if let Some(last) = self.tokens.last() {
2318 if matches!(
2319 last.token_type,
2320 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2321 ) && last.end_position.line == self.position.line
2322 {
2323 return Err(Error::scan(
2324 self.position,
2325 "Plain scalar immediately after flow collection close"
2326 .to_string(),
2327 ));
2328 }
2329 }
2330 }
2331 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2332 self.maybe_open_block_mapping_for_key()?;
2333 }
2334
2335 let token = self.scan_plain_scalar()?;
2336 self.tokens.push(token);
2337 }
2338
2339 _ => {
2340 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2341 .with_suggestion("Check for valid YAML syntax characters".to_string());
2342 return Err(Error::invalid_character_with_context(
2343 self.position,
2344 ch,
2345 "YAML document",
2346 context,
2347 ));
2348 }
2349 }
2350 }
2351
2352 // Inline sequences (nested \`- -\` on one line) used to be
2353 // closed unconditionally at end-of-line. But a nested sequence
2354 // can span lines (`- - a\n - b\n- c`) — in that case the inner
2355 // sequence must remain open until handle_indentation sees a
2356 // dedent. Reset the inline-sequence counter (so the next line
2357 // is judged on its own merits) but DO NOT emit BlockEnd —
2358 // handle_indentation's indent_stack pop, the end-of-stream
2359 // close at scan_next_token, and the explicit-dedent close at
2360 // handle_indentation's bottom each provide a correct close.
2361 self.inline_sequence_depth = 0;
2362
2363 Ok(())
2364 }
2365
2366 /// Scan the next token lazily
2367 fn scan_next_token(&mut self) -> Result<()> {
2368 if self.done {
2369 return Ok(());
2370 }
2371
2372 // Add stream start token if this is the beginning
2373 if self.tokens.is_empty() {
2374 self.tokens
2375 .push(Token::simple(TokenType::StreamStart, self.position));
2376 return Ok(());
2377 }
2378
2379 // Check if we're at the end of input
2380 if self.current_char.is_none() {
2381 if !self
2382 .tokens
2383 .iter()
2384 .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2385 {
2386 self.tokens
2387 .push(Token::simple(TokenType::StreamEnd, self.position));
2388 }
2389 self.done = true;
2390 return Ok(());
2391 }
2392
2393 // For now, fall back to scanning all tokens at once for the lazy scanner
2394 // This is a simplified implementation - a full streaming parser would
2395 // need more sophisticated state management
2396 let tokens_before = self.tokens.len();
2397 self.scan_all_tokens()?;
2398
2399 // Mark as done after scanning all tokens
2400 if self.tokens.len() == tokens_before {
2401 self.done = true;
2402 }
2403
2404 Ok(())
2405 }
2406
2407 /// Pre-scan all tokens (simplified approach for basic implementation)
2408 fn scan_all_tokens(&mut self) -> Result<()> {
2409 // Only add StreamStart if we don't have it yet
2410 if !self
2411 .tokens
2412 .iter()
2413 .any(|t| matches!(t.token_type, TokenType::StreamStart))
2414 {
2415 self.tokens
2416 .push(Token::simple(TokenType::StreamStart, self.position));
2417 }
2418
2419 while self.current_char.is_some() {
2420 self.process_line()?;
2421
2422 // Advance past newlines
2423 while let Some(ch) = self.current_char {
2424 if ch == '\n' || ch == '\r' {
2425 self.advance();
2426 } else {
2427 break;
2428 }
2429 }
2430 }
2431
2432 // Close any remaining compact sequences (before their parent mappings)
2433 while self.compact_sequence_indents.pop().is_some() {
2434 self.tokens
2435 .push(Token::simple(TokenType::BlockEnd, self.position));
2436 }
2437
2438 // Close any remaining blocks
2439 while self.indent_stack.len() > 1 {
2440 self.indent_stack.pop();
2441 self.indent_is_sequence.pop();
2442 self.tokens
2443 .push(Token::simple(TokenType::BlockEnd, self.position));
2444 }
2445
2446 self.tokens
2447 .push(Token::simple(TokenType::StreamEnd, self.position));
2448 self.done = true;
2449 Ok(())
2450 }
2451
2452 /// Peek at a character at the given offset (can be negative)
2453 /// Check if the current position starts a pure number (digits/dots/minus only,
2454 /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2455 fn is_pure_number(&self) -> bool {
2456 let mut offset: isize = 0;
2457 let first = self.peek_char(0);
2458 // Skip leading minus
2459 if first == Some('-') {
2460 offset = 1;
2461 }
2462 // Scan digits and at most one dot
2463 let mut has_digit = false;
2464 let mut dot_count = 0;
2465 loop {
2466 match self.peek_char(offset) {
2467 Some(c) if c.is_ascii_digit() => {
2468 has_digit = true;
2469 offset += 1;
2470 }
2471 Some('.') => {
2472 dot_count += 1;
2473 if dot_count > 1 {
2474 // Multiple dots (e.g. 0.5.8) — not a number
2475 return false;
2476 }
2477 offset += 1;
2478 }
2479 Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2480 // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2481 return false;
2482 }
2483 Some(c) => {
2484 // For a token to be a pure number, what follows
2485 // the digits must be end-of-token. In flow
2486 // context that's a flow indicator. In block
2487 // context the rest of the line must be pure
2488 // whitespace (possibly trailing a comment) — if
2489 // there's more non-whitespace content on this
2490 // line, the digits are part of a larger plain
2491 // scalar like \`1 - 3\` (yaml-test-suite P76L)
2492 // or \`20:03:20\` (yaml-test-suite U9NS).
2493 if self.flow_level > 0 && ",[]{}".contains(c) {
2494 return has_digit;
2495 }
2496 if c == '\n' || c == '\r' {
2497 return has_digit;
2498 }
2499 if c == ' ' || c == '\t' {
2500 // Look ahead: rest of line must be whitespace
2501 // or a comment.
2502 let mut probe = offset + 1;
2503 loop {
2504 match self.peek_char(probe) {
2505 None => return has_digit,
2506 Some('\n' | '\r') => return has_digit,
2507 Some('#') => return has_digit,
2508 Some(' ' | '\t') => probe += 1,
2509 Some(_) => return false,
2510 }
2511 }
2512 }
2513 if c == ':' {
2514 let next = self.peek_char(offset + 1);
2515 return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2516 }
2517 return false;
2518 }
2519 None => return has_digit,
2520 }
2521 }
2522 }
2523
2524 fn peek_char(&self, offset: isize) -> Option<char> {
2525 if offset >= 0 {
2526 let target_index = self.current_char_index + offset as usize;
2527 if target_index < self.char_cache.len() {
2528 Some(self.char_cache[target_index])
2529 } else {
2530 None
2531 }
2532 } else {
2533 let offset_magnitude = (-offset) as usize;
2534 if self.current_char_index >= offset_magnitude {
2535 Some(self.char_cache[self.current_char_index - offset_magnitude])
2536 } else {
2537 None
2538 }
2539 }
2540 }
2541
2542 /// Scan an anchor token (&name)
2543 fn scan_anchor(&mut self) -> Result<Token> {
2544 let start_pos = self.position;
2545 self.advance(); // Skip '&'
2546
2547 let name = self.scan_identifier()?;
2548 if name.is_empty() {
2549 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2550 "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2551 );
2552 return Err(Error::scan_with_context(
2553 self.position,
2554 "Anchor name cannot be empty",
2555 context,
2556 ));
2557 }
2558
2559 // Track anchor for resource limits
2560 self.resource_tracker.add_anchor(&self.limits)?;
2561
2562 Ok(Token::new(
2563 TokenType::Anchor(name),
2564 start_pos,
2565 self.position,
2566 ))
2567 }
2568
2569 /// Scan an alias token (*name)
2570 fn scan_alias(&mut self) -> Result<Token> {
2571 let start_pos = self.position;
2572 self.advance(); // Skip '*'
2573
2574 let name = self.scan_identifier()?;
2575 if name.is_empty() {
2576 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2577 "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2578 );
2579 return Err(Error::scan_with_context(
2580 self.position,
2581 "Alias name cannot be empty",
2582 context,
2583 ));
2584 }
2585
2586 Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2587 }
2588
2589 /// Scan an identifier (used for anchor and alias names)
2590 fn scan_identifier(&mut self) -> Result<String> {
2591 // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2592 // exclusions are whitespace and the flow indicators `,[]{}`. This
2593 // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2594 // codepoints (including emoji), matching the spec exactly.
2595 let mut identifier = String::new();
2596 while let Some(ch) = self.current_char {
2597 if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2598 break;
2599 }
2600 identifier.push(ch);
2601 self.advance();
2602 }
2603 Ok(identifier)
2604 }
2605
2606 /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2607 fn scan_tag(&mut self) -> Result<Token> {
2608 let start_pos = self.position;
2609 self.advance(); // Skip first '!'
2610
2611 let mut tag = String::from("!");
2612
2613 // Check for verbatim tag format: !<tag>
2614 if self.current_char == Some('<') {
2615 tag.push('<');
2616 self.advance(); // Skip '<'
2617
2618 // Scan until closing '>'
2619 while let Some(ch) = self.current_char {
2620 if ch == '>' {
2621 tag.push(ch);
2622 self.advance();
2623 break;
2624 } else if ch.is_control() || ch.is_whitespace() {
2625 return Err(Error::scan(
2626 self.position,
2627 "Invalid character in verbatim tag".to_string(),
2628 ));
2629 }
2630 tag.push(ch);
2631 self.advance();
2632 }
2633 } else {
2634 // Check for secondary tag handle: !!
2635 if self.current_char == Some('!') {
2636 tag.push('!');
2637 self.advance(); // Skip second '!'
2638 }
2639
2640 // Scan tag name/suffix.
2641 //
2642 // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2643 // contain any URI character (RFC 3986 unreserved + sub-delims +
2644 // a few others) or `%XX` percent-encoded bytes. The handful of
2645 // characters listed below covers the alphanumeric + URI-safe
2646 // punctuation set used by yaml-test-suite. Percent decoding of
2647 // `%XX` happens later in `TagResolver::resolve`.
2648 //
2649 // §5.3: inside a flow collection, the flow indicators
2650 // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2651 // must NOT consume them into the tag suffix even though
2652 // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2653 // YAML 1.2 in practice treats `,` as a flow indicator that
2654 // must be percent-encoded (\`%2C\`) when it appears inside
2655 // a tag suffix — bare \`,\` is not allowed in EITHER block
2656 // or flow context (yaml-test-suite U99R).
2657 while let Some(ch) = self.current_char {
2658 if matches!(ch, ',') {
2659 break;
2660 }
2661 if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2662 break;
2663 }
2664 // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2665 // `tag:yaml.org,2002:str` legitimately contains two
2666 // colons inside its URI. But a `:` followed by
2667 // whitespace, EOL or EOF is the YAML mapping-value
2668 // indicator and MUST terminate the tag, otherwise
2669 // `!handle!suffix: value` is mis-scanned as
2670 // `Tag("!handle!suffix:") Scalar("value")` and the
2671 // implicit-key mapping structure is lost. Mirrors the
2672 // `,` carve-out above (a valid URI char that's also a
2673 // YAML flow indicator in some contexts).
2674 if ch == ':' {
2675 match self.peek_char(1) {
2676 None => break,
2677 Some(c) if c.is_whitespace() => break,
2678 _ => {}
2679 }
2680 }
2681 if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2682 tag.push(ch);
2683 self.advance();
2684 } else {
2685 break;
2686 }
2687 }
2688 }
2689
2690 Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2691 }
2692
2693 /// Scan a literal block scalar (|)
2694 fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2695 let start_pos = self.position;
2696 self.advance(); // Skip '|'
2697
2698 // Parse block scalar header (indicators like +, -, explicit indent)
2699 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2700
2701 // Skip to next line
2702 self.skip_to_next_line()?;
2703
2704 // Determine indentation. `base_indent` is the surrounding
2705 // block's indent — i.e. the indent of the sequence or
2706 // mapping that contains this scalar. `self.current_indent`
2707 // is sometimes set to the inline indicator column (e.g. 2
2708 // for `- |`), which would make `base_indent + explicit`
2709 // wrong; use the top of `indent_stack` instead
2710 // (yaml-test-suite 4QFQ `|1`).
2711 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2712 let content_indent = if let Some(explicit) = explicit_indent {
2713 base_indent + explicit
2714 } else {
2715 // Find the first non-empty content line to determine indentation
2716 self.find_block_scalar_indent(base_indent)?
2717 };
2718
2719 // Collect the literal block content
2720 let content = self.collect_literal_block_content(content_indent, chomping)?;
2721
2722 Ok(Token::new(
2723 TokenType::BlockScalarLiteral(content),
2724 start_pos,
2725 self.position,
2726 ))
2727 }
2728
2729 /// Scan a folded block scalar (>)
2730 fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2731 let start_pos = self.position;
2732 self.advance(); // Skip '>'
2733
2734 // Parse block scalar header (indicators like +, -, explicit indent)
2735 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2736
2737 // Skip to next line
2738 self.skip_to_next_line()?;
2739
2740 // See scan_literal_block_scalar for why we read `indent_stack`
2741 // rather than `current_indent`.
2742 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2743 let content_indent = if let Some(explicit) = explicit_indent {
2744 base_indent + explicit
2745 } else {
2746 // Find the first non-empty content line to determine indentation
2747 self.find_block_scalar_indent(base_indent)?
2748 };
2749
2750 // Collect the folded block content
2751 let content = self.collect_folded_block_content(content_indent, chomping)?;
2752
2753 Ok(Token::new(
2754 TokenType::BlockScalarFolded(content),
2755 start_pos,
2756 self.position,
2757 ))
2758 }
2759
2760 /// Parse block scalar header indicators (+, -, and explicit indent)
2761 fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2762 let mut chomping = ChompingMode::Clip;
2763 let mut explicit_indent: Option<usize> = None;
2764 // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2765 // \`>#x\` are invalid (yaml-test-suite X4QW).
2766 let mut seen_separator_ws = false;
2767
2768 // Parse indicators in any order
2769 while let Some(ch) = self.current_char {
2770 match ch {
2771 '+' => {
2772 chomping = ChompingMode::Keep;
2773 self.advance();
2774 }
2775 '-' => {
2776 chomping = ChompingMode::Strip;
2777 self.advance();
2778 }
2779 '0'..='9' => {
2780 let digit = ch.to_digit(10).unwrap() as usize;
2781 if explicit_indent.is_some() {
2782 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2783 .with_suggestion(
2784 "Use only one indent indicator digit in block scalar".to_string(),
2785 );
2786 return Err(Error::scan_with_context(
2787 self.position,
2788 "Multiple indent indicators in block scalar",
2789 context,
2790 ));
2791 }
2792 // YAML 1.2 §8.1.1.1: explicit indent indicator is
2793 // 1..=9. `|0` and `>0` are invalid
2794 // (yaml-test-suite 2G84/00).
2795 if digit == 0 {
2796 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2797 .with_suggestion(
2798 "Block-scalar indent indicator must be 1-9".to_string(),
2799 );
2800 return Err(Error::scan_with_context(
2801 self.position,
2802 "Block-scalar indent indicator `0` is invalid",
2803 context,
2804 ));
2805 }
2806 explicit_indent = Some(digit);
2807 self.advance();
2808 }
2809 ' ' | '\t' => {
2810 seen_separator_ws = true;
2811 self.advance(); // Skip whitespace
2812 }
2813 '#' => {
2814 if !seen_separator_ws {
2815 return Err(Error::scan(
2816 self.position,
2817 "Comment in block-scalar header must be preceded by whitespace"
2818 .to_string(),
2819 ));
2820 }
2821 // Skip comment to end of line
2822 while let Some(ch) = self.current_char {
2823 self.advance();
2824 if ch == '\n' || ch == '\r' {
2825 break;
2826 }
2827 }
2828 break;
2829 }
2830 '\n' | '\r' => break,
2831 _ => {
2832 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2833 .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2834 return Err(Error::invalid_character_with_context(
2835 self.position,
2836 ch,
2837 "block scalar header",
2838 context,
2839 ));
2840 }
2841 }
2842 }
2843
2844 Ok((chomping, explicit_indent))
2845 }
2846
2847 /// Advance the cursor PAST the next line break, but do not consume
2848 /// any leading whitespace on the line that follows. The block-
2849 /// scalar header parser uses this to step from the indicator line
2850 /// to the start of the content line — the next line's leading
2851 /// spaces are part of its content_indent, not header whitespace.
2852 fn skip_to_next_line(&mut self) -> Result<()> {
2853 // If we're already at column 1 (the comment handler in
2854 // scan_block_scalar_header may have already advanced past a
2855 // newline), do nothing — the next line's leading whitespace
2856 // belongs to its content_indent.
2857 if self.position.column == 1 {
2858 return Ok(());
2859 }
2860 while let Some(ch) = self.current_char {
2861 match ch {
2862 '\n' | '\r' => {
2863 self.advance();
2864 return Ok(());
2865 }
2866 ' ' | '\t' => {
2867 self.advance();
2868 }
2869 _ => return Ok(()),
2870 }
2871 }
2872 Ok(())
2873 }
2874
2875 /// Find the content indentation for a block scalar.
2876 ///
2877 /// Per spec §8.1.1.1, indent is the leading-space count of the first
2878 /// non-empty content line (or the longest blank-line indent if no
2879 /// non-empty line exists). A non-empty line whose indent is not
2880 /// strictly deeper than `base_indent` is outside the scalar's
2881 /// scope — that line is a sibling structure, not content
2882 /// (yaml-test-suite K858).
2883 fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
2884 let saved_position = self.position;
2885 let saved_char = self.current_char;
2886 let saved_char_index = self.current_char_index;
2887
2888 let mut max_blank_indent: usize = 0;
2889 let mut found = false;
2890 let mut content_indent: usize = 1;
2891
2892 loop {
2893 let mut line_indent = 0;
2894 while self.current_char == Some(' ') {
2895 line_indent += 1;
2896 self.advance();
2897 }
2898 // §6.1 + §8.1: tabs cannot serve as block-scalar
2899 // indentation. A line that BEGINS with a tab (no leading
2900 // spaces) inside the block scalar's indent search is
2901 // invalid (yaml-test-suite Y79Y/000 \`foo: |\\n\\tbar\`).
2902 // Tabs that appear AFTER one or more spaces are content,
2903 // not indentation, and remain valid (yaml-test-suite
2904 // 96NN/00 \`foo: |-\\n \\tbar\`).
2905 if line_indent == 0 && self.current_char == Some('\t') {
2906 return Err(Error::scan(
2907 self.position,
2908 "Tab cannot serve as block-scalar indentation".to_string(),
2909 ));
2910 }
2911
2912 match self.current_char {
2913 None => {
2914 if line_indent > max_blank_indent {
2915 max_blank_indent = line_indent;
2916 }
2917 break;
2918 }
2919 Some('\n' | '\r') => {
2920 if line_indent > max_blank_indent {
2921 max_blank_indent = line_indent;
2922 }
2923 self.advance();
2924 // fall through to next iteration
2925 }
2926 Some(_) => {
2927 // If we're nested inside another block — either
2928 // via the `indent_stack` (normal mapping/sequence
2929 // open) or `compact_sequence_indents` (a
2930 // compact block sequence at the same indent as
2931 // its parent) — and this candidate line is not
2932 // strictly deeper than base_indent, it's a
2933 // sibling outside the scalar's scope (yaml-test-
2934 // suite K858, P2AD).
2935 let inside_block =
2936 self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
2937 if inside_block && line_indent <= base_indent {
2938 content_indent = max_blank_indent.max(base_indent + 1);
2939 } else {
2940 content_indent = line_indent;
2941 }
2942 // §8.1.2.1: leading blank lines may not exceed the
2943 // detected content indent — that ambiguity is
2944 // invalid (yaml-test-suite W9L4, S98Z).
2945 if max_blank_indent > content_indent {
2946 self.position = saved_position;
2947 self.current_char = saved_char;
2948 self.current_char_index = saved_char_index;
2949 return Err(Error::scan(
2950 self.position,
2951 "Block scalar leading blank-line indent exceeds content indent"
2952 .to_string(),
2953 ));
2954 }
2955 found = true;
2956 break;
2957 }
2958 }
2959 }
2960
2961 if !found {
2962 content_indent = max_blank_indent;
2963 }
2964
2965 self.position = saved_position;
2966 self.current_char = saved_char;
2967 self.current_char_index = saved_char_index;
2968
2969 Ok(content_indent)
2970 }
2971
2972 /// Count indentation at start of current line
2973 fn count_line_indent(&mut self) -> usize {
2974 let mut indent = 0;
2975 let saved_position = self.position;
2976 let saved_char = self.current_char;
2977 let saved_char_index = self.current_char_index;
2978
2979 while let Some(ch) = self.current_char {
2980 if ch == ' ' {
2981 indent += 1;
2982 self.advance();
2983 } else if ch == '\t' {
2984 indent += 8; // Tab counts as 8 spaces
2985 self.advance();
2986 } else {
2987 break;
2988 }
2989 }
2990
2991 // Restore position
2992 self.position = saved_position;
2993 self.current_char = saved_char;
2994 self.current_char_index = saved_char_index;
2995
2996 indent
2997 }
2998
2999 /// Collect content for a literal block scalar.
3000 ///
3001 /// Each line is preserved with its terminating newline. After collection
3002 /// we apply the chomping mode per spec §8.1.1.2.
3003 fn collect_literal_block_content(
3004 &mut self,
3005 content_indent: usize,
3006 chomping: ChompingMode,
3007 ) -> Result<String> {
3008 let mut content = String::new();
3009
3010 loop {
3011 // Count current line's leading-space indent.
3012 let mut line_indent = 0;
3013 let save_pos = self.position;
3014 let save_ch = self.current_char;
3015 let save_idx = self.current_char_index;
3016 while self.current_char == Some(' ') {
3017 line_indent += 1;
3018 self.advance();
3019 }
3020
3021 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3022
3023 if !line_is_blank && line_indent < content_indent {
3024 // Non-empty line with less indent ends the scalar; rewind.
3025 self.position = save_pos;
3026 self.current_char = save_ch;
3027 self.current_char_index = save_idx;
3028 break;
3029 }
3030
3031 // Document marker at line start always ends the scalar,
3032 // regardless of content_indent (allows zero-indented
3033 // block scalars per yaml-test-suite FP8R).
3034 if line_indent == 0 && self.is_doc_marker_here() {
3035 self.position = save_pos;
3036 self.current_char = save_ch;
3037 self.current_char_index = save_idx;
3038 break;
3039 }
3040
3041 if line_is_blank {
3042 // A blank line counts when there's an actual line break
3043 // to consume. EOF after we've consumed some whitespace
3044 // on the trailing line ALSO counts as one final blank
3045 // line (yaml-test-suite JEF9/02: `- |+\n `).
3046 if matches!(self.current_char, Some('\n' | '\r')) {
3047 // Whitespace beyond content_indent is literal content
3048 // even on blank lines (yaml-test-suite 6FWR).
3049 for _ in content_indent..line_indent {
3050 content.push(' ');
3051 }
3052 content.push('\n');
3053 self.advance();
3054 continue;
3055 }
3056 if line_indent > 0 {
3057 for _ in content_indent..line_indent {
3058 content.push(' ');
3059 }
3060 content.push('\n');
3061 }
3062 break;
3063 }
3064
3065 // Content line: we already consumed `line_indent` spaces, but
3066 // only `content_indent` of them belong to indentation. Any
3067 // extra leading spaces are literal content.
3068 let mut line = String::new();
3069 for _ in content_indent..line_indent {
3070 line.push(' ');
3071 }
3072 while let Some(ch) = self.current_char {
3073 if ch == '\n' || ch == '\r' {
3074 self.advance();
3075 break;
3076 }
3077 line.push(ch);
3078 self.advance();
3079 }
3080 content.push_str(&line);
3081 content.push('\n');
3082
3083 if self.current_char.is_none() {
3084 break;
3085 }
3086 }
3087
3088 Ok(apply_chomping(content, chomping))
3089 }
3090
3091 /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3092 fn is_doc_marker_here(&self) -> bool {
3093 let c0 = self.current_char;
3094 let c1 = self.peek_char(1);
3095 let c2 = self.peek_char(2);
3096 let c3 = self.peek_char(3);
3097 let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3098 (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3099 || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3100 }
3101
3102 /// Collect content for a folded block scalar.
3103 ///
3104 /// Folding rules (§8.1.3): a sequence of single blank lines between
3105 /// equally-indented non-empty content lines collapses into a single
3106 /// space; runs of blank lines emit `n-1` newlines; more-indented
3107 /// lines preserve their newline boundaries. After collection, apply
3108 /// chomping (§8.1.1.2).
3109 fn collect_folded_block_content(
3110 &mut self,
3111 content_indent: usize,
3112 chomping: ChompingMode,
3113 ) -> Result<String> {
3114 #[derive(Clone, Copy, PartialEq, Eq)]
3115 enum LineKind {
3116 Normal,
3117 MoreIndented,
3118 Empty,
3119 }
3120 struct Line {
3121 text: String,
3122 kind: LineKind,
3123 }
3124
3125 let mut lines: Vec<Line> = Vec::new();
3126
3127 loop {
3128 let mut line_indent = 0;
3129 let save_pos = self.position;
3130 let save_ch = self.current_char;
3131 let save_idx = self.current_char_index;
3132 while self.current_char == Some(' ') {
3133 line_indent += 1;
3134 self.advance();
3135 }
3136
3137 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3138
3139 if !line_is_blank && line_indent < content_indent {
3140 self.position = save_pos;
3141 self.current_char = save_ch;
3142 self.current_char_index = save_idx;
3143 break;
3144 }
3145
3146 if line_indent == 0 && self.is_doc_marker_here() {
3147 self.position = save_pos;
3148 self.current_char = save_ch;
3149 self.current_char_index = save_idx;
3150 break;
3151 }
3152
3153 if line_is_blank {
3154 if matches!(self.current_char, Some('\n' | '\r')) {
3155 lines.push(Line {
3156 text: String::new(),
3157 kind: LineKind::Empty,
3158 });
3159 self.advance();
3160 continue;
3161 }
3162 break;
3163 }
3164
3165 // Capture extra-indent leading spaces as part of content.
3166 let mut text = String::new();
3167 for _ in content_indent..line_indent {
3168 text.push(' ');
3169 }
3170 while let Some(ch) = self.current_char {
3171 if ch == '\n' || ch == '\r' {
3172 self.advance();
3173 break;
3174 }
3175 text.push(ch);
3176 self.advance();
3177 }
3178 // §8.1.3.2: "more indented" means the content (after the
3179 // common indent strip) begins with extra whitespace —
3180 // either spaces or tabs (yaml-test-suite MJS9).
3181 let kind = if text.starts_with(' ') || text.starts_with('\t') {
3182 LineKind::MoreIndented
3183 } else {
3184 LineKind::Normal
3185 };
3186 lines.push(Line { text, kind });
3187
3188 if self.current_char.is_none() {
3189 break;
3190 }
3191 }
3192
3193 // Build the folded output.
3194 let mut content = String::new();
3195 let mut idx = 0;
3196 while idx < lines.len() {
3197 let line = &lines[idx];
3198 match line.kind {
3199 LineKind::Normal | LineKind::MoreIndented => {
3200 content.push_str(&line.text);
3201 // Lookahead: count immediately-following empty lines.
3202 let mut j = idx + 1;
3203 let mut empties = 0;
3204 while j < lines.len() && lines[j].kind == LineKind::Empty {
3205 empties += 1;
3206 j += 1;
3207 }
3208 if j < lines.len() {
3209 // Spec §8.1.3.2: folding behaviour depends on
3210 // whether either surrounding content line is
3211 // "more indented" than the content indent.
3212 // - both Normal, 0 empties → fold to space.
3213 // - both Normal, k empties → k newlines (one
3214 // break folded out).
3215 // - any MoreIndented, 0 empties → 1 newline.
3216 // - any MoreIndented, k empties → k+1 newlines
3217 // (every break preserved).
3218 let mi_adjacent = line.kind == LineKind::MoreIndented
3219 || lines[j].kind == LineKind::MoreIndented;
3220 if empties == 0 {
3221 if mi_adjacent {
3222 content.push('\n');
3223 } else {
3224 content.push(' ');
3225 }
3226 } else {
3227 let breaks = if mi_adjacent { empties + 1 } else { empties };
3228 for _ in 0..breaks {
3229 content.push('\n');
3230 }
3231 }
3232 idx = j;
3233 } else {
3234 // End of stream after content (possibly trailing empties).
3235 // Always emit final `\n` for the last content line; extra
3236 // trailing empties contribute additional `\n`s, and chomping
3237 // will trim them later if needed.
3238 content.push('\n');
3239 for _ in 0..empties {
3240 content.push('\n');
3241 }
3242 break;
3243 }
3244 }
3245 LineKind::Empty => {
3246 // Leading empty lines (no preceding content): emit as `\n`s.
3247 content.push('\n');
3248 idx += 1;
3249 }
3250 }
3251 }
3252
3253 Ok(apply_chomping(content, chomping))
3254 }
3255
3256 /// Emit a `BlockMappingStart` token if the current position is the
3257 /// start of an implicit key and no mapping is yet active at this
3258 /// indent level. Shared by plain and quoted scalar dispatch.
3259 fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3260 // Use `unwrap_or(0)` for parity with the indentation module's
3261 // helpers — defends against error-recovery pop paths that could
3262 // leave the stack momentarily empty (#18).
3263 let last_indent = self.indent_stack.last().copied().unwrap_or(0);
3264 let should_start_new_mapping = if self.current_indent > last_indent {
3265 true
3266 } else if self.current_indent == last_indent {
3267 !self.check_active_mapping_at_level(self.current_indent)
3268 } else {
3269 false
3270 };
3271 if should_start_new_mapping {
3272 // §6.1 + §8.22: opening a NEW block mapping at deeper
3273 // indent than the parent only makes sense if the parent
3274 // has a key WITHOUT a value (the new mapping IS that
3275 // value). If the parent's last content is a complete
3276 // (key, value) pair — i.e. the most recent meaningful
3277 // token is a value-position scalar/alias/close — then
3278 // there's no node to host the deeper mapping (yaml-test-
3279 // suite U44R: \`map:\\n key1: q\\n key2: bad\` — key2
3280 // is deeper than key1 but key1's value is already \`q\`).
3281 if self.current_indent > last_indent && last_indent > 0 {
3282 let mut depth = 0i32;
3283 let mut last_meaningful = None;
3284 for t in self.tokens.iter().rev() {
3285 match &t.token_type {
3286 TokenType::BlockEnd => depth += 1,
3287 TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3288 if depth == 0 {
3289 break;
3290 }
3291 depth -= 1;
3292 }
3293 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3294 other => {
3295 if depth == 0 {
3296 last_meaningful = Some(other.clone());
3297 break;
3298 }
3299 }
3300 }
3301 }
3302 if matches!(
3303 last_meaningful,
3304 Some(
3305 TokenType::Scalar(..)
3306 | TokenType::Alias(_)
3307 | TokenType::FlowSequenceEnd
3308 | TokenType::FlowMappingEnd
3309 | TokenType::BlockScalarLiteral(..)
3310 | TokenType::BlockScalarFolded(..)
3311 )
3312 ) {
3313 return Err(Error::scan(
3314 self.position,
3315 "Indentation increase has no parent in current mapping/sequence"
3316 .to_string(),
3317 ));
3318 }
3319 }
3320 self.indent_stack.push(self.current_indent);
3321 self.indent_is_sequence.push(false);
3322 self.resource_tracker
3323 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3324 self.tokens
3325 .push(Token::simple(TokenType::BlockMappingStart, self.position));
3326 }
3327 Ok(())
3328 }
3329
3330 /// Look ahead on the current line for a `:` that marks a mapping key.
3331 ///
3332 /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3333 /// followed by whitespace. Only `: ` terminates the scalar. If the
3334 /// line begins with `"` or `'`, the leading quoted scalar's contents
3335 /// are scanned past (including `''` and `\"` escapes) before looking
3336 /// for the `: ` that would make this scalar a key. This handles
3337 /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3338 /// For an alias/anchor at the current position, scan past
3339 /// the `&`/`*` and the name characters; if the FIRST char that
3340 /// would terminate the name is `:`, the colon is PART of the
3341 /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3342 /// that case so the caller can skip the implicit-key fast-path.
3343 fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3344 // Start after the `&` / `*` introducer.
3345 let mut i = self.current_char_index + 1;
3346 let n = self.char_cache.len();
3347 // Per scan_identifier rules: stop at whitespace or flow indicator.
3348 while i < n {
3349 let c = self.char_cache[i];
3350 if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3351 break;
3352 }
3353 i += 1;
3354 }
3355 // If the next char (or last consumed?) at termination is `:`,
3356 // then the name ended with `:`. Look at the LAST consumed
3357 // char. Actually our scan_identifier accepts `:` as part of
3358 // name — so the colon is already in the name. There's no
3359 // separate "value indicator" colon after.
3360 //
3361 // For the implicit-key fast path to be wrong, we need the
3362 // name to END with `:` (last char of name is `:`).
3363 if i > self.current_char_index + 1 {
3364 let last_name_char = self.char_cache[i - 1];
3365 if last_name_char == ':' {
3366 return true;
3367 }
3368 }
3369 false
3370 }
3371
3372 /// Scan ahead on the current line (the rest of the post-indent
3373 /// content) to determine whether it looks like an implicit
3374 /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3375 /// before any newline.
3376 fn line_after_indent_is_implicit_key(&self) -> bool {
3377 let mut i = self.current_char_index;
3378 let n = self.char_cache.len();
3379 while i < n {
3380 let ch = self.char_cache[i];
3381 if ch == '\n' || ch == '\r' {
3382 return false;
3383 }
3384 if ch == ':' {
3385 let next = self.char_cache.get(i + 1).copied();
3386 if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3387 return true;
3388 }
3389 }
3390 i += 1;
3391 }
3392 false
3393 }
3394
3395 /// Walk back through recent tokens; if the last non-property
3396 /// token was `Value` (`:`), the parser is in value-expectation
3397 /// mode (key not yet matched with a value).
3398 fn most_recent_token_is_value_separator(&self) -> bool {
3399 for t in self.tokens.iter().rev() {
3400 match t.token_type {
3401 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3402 TokenType::Value => return true,
3403 _ => return false,
3404 }
3405 }
3406 false
3407 }
3408
3409 fn check_for_mapping_ahead(&self) -> bool {
3410 let mut i = self.current_char_index;
3411 let n = self.char_cache.len();
3412 if i < n {
3413 let first = self.char_cache[i];
3414 if first == '\'' || first == '"' {
3415 let quote = first;
3416 i += 1;
3417 while i < n {
3418 let c = self.char_cache[i];
3419 if c == '\n' || c == '\r' {
3420 return false; // unterminated quote on line
3421 }
3422 if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3423 // `''` is the in-string single-quote escape.
3424 i += 2;
3425 continue;
3426 }
3427 if quote == '"' && c == '\\' {
3428 // Skip the escaped char.
3429 i += 2;
3430 continue;
3431 }
3432 if c == quote {
3433 i += 1;
3434 break;
3435 }
3436 i += 1;
3437 }
3438 }
3439 }
3440 // Skip balanced flow collections — a `:` *inside* `[...]` or
3441 // `{...}` does NOT make the line a block-mapping key (the flow
3442 // collection itself can BE the key, but its inner colons are
3443 // part of its own structure). yaml-test-suite: `{key: v}` is
3444 // a standalone flow mapping; `[a]: outer` is a block-map key.
3445 let mut flow_depth: i32 = 0;
3446 while i < n {
3447 let ch = self.char_cache[i];
3448 match ch {
3449 '\n' | '\r' => return false,
3450 '[' | '{' => flow_depth += 1,
3451 ']' | '}' => flow_depth -= 1,
3452 ':' if flow_depth <= 0 => {
3453 let next = self.char_cache.get(i + 1).copied();
3454 match next {
3455 None => return true,
3456 Some(c) if c.is_whitespace() => return true,
3457 _ => {}
3458 }
3459 }
3460 _ => {}
3461 }
3462 i += 1;
3463 }
3464 false
3465 }
3466
3467 /// Check if there's an active mapping at the specified indentation level
3468 /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3469 fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3470 let mut depth = 0;
3471
3472 // Walk backwards through tokens to find the innermost unmatched block start.
3473 // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3474 // decrement it (both open blocks that need a matching BlockEnd).
3475 // When depth == 0 we have found the block start that is still "open".
3476 for token in self.tokens.iter().rev() {
3477 match &token.token_type {
3478 TokenType::BlockMappingStart => {
3479 if depth == 0 {
3480 // The innermost open block is a mapping — active at this level.
3481 return true;
3482 }
3483 depth -= 1;
3484 }
3485 TokenType::BlockSequenceStart => {
3486 if depth == 0 {
3487 // The innermost open block is a sequence, not a mapping.
3488 return false;
3489 }
3490 depth -= 1;
3491 }
3492 TokenType::BlockEnd => {
3493 depth += 1;
3494 }
3495 TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3496 // Stop at document boundaries
3497 break;
3498 }
3499 _ => {}
3500 }
3501 }
3502
3503 false
3504 }
3505}
3506
3507impl Scanner for BasicScanner {
3508 fn check_token(&self) -> bool {
3509 // For lazy scanning: check if we have cached tokens or can generate more
3510 self.token_index < self.tokens.len() || !self.done
3511 }
3512
3513 fn peek_token(&self) -> Result<Option<&Token>> {
3514 // This is a bit tricky with lazy scanning since peek shouldn't mutate
3515 // For now, return cached token if available
3516 Ok(self.tokens.get(self.token_index))
3517 }
3518
3519 fn get_token(&mut self) -> Result<Option<Token>> {
3520 // If we need more tokens and haven't finished, scan next token
3521 if self.token_index >= self.tokens.len() && !self.done {
3522 self.scan_next_token()?;
3523 }
3524
3525 if self.token_index < self.tokens.len() {
3526 let token = self.tokens[self.token_index].clone();
3527 self.token_index += 1;
3528 Ok(Some(token))
3529 } else {
3530 Ok(None)
3531 }
3532 }
3533
3534 fn reset(&mut self) {
3535 self.token_index = 0;
3536 self.position = Position::start();
3537 self.tokens.clear();
3538 self.done = false;
3539 self.current_char = self.input.chars().next();
3540 self.indent_stack = vec![0];
3541 self.current_indent = 0;
3542 self.flow_level = 0;
3543 self.detected_indent_style = None;
3544 self.indent_samples.clear();
3545 self.previous_indent_level = 0;
3546 self.current_char_index = 0;
3547 self.current_char = self.char_cache.first().copied();
3548 }
3549
3550 fn position(&self) -> Position {
3551 self.position
3552 }
3553
3554 fn input(&self) -> &str {
3555 &self.input
3556 }
3557}
3558
3559#[cfg(test)]
3560mod tests {
3561 use super::*;
3562
/// Regression test for #19: the comment-preserving eager constructor must
/// record a scanning error for malformed input, so callers can detect the
/// failure through `has_scanning_error()`. It used to drop the result of
/// `scan_all_tokens`, silently truncating the token stream.
#[test]
fn new_eager_with_comments_propagates_scanning_errors() {
    // An unterminated quoted scalar that runs into a doc-start marker is a
    // scanning error (see `Error::scan(... "inside quoted scalar")`).
    let malformed = "\"abc\n---\n";

    // Parity anchor: the plain eager constructor must report the error.
    let baseline = BasicScanner::new_eager(String::from(malformed));
    assert!(
        baseline.has_scanning_error(),
        "precondition: malformed input must produce a scanning error via new_eager"
    );

    // The constructor under test must report it as well.
    let commented = BasicScanner::new_eager_with_comments(String::from(malformed));
    assert!(
        commented.has_scanning_error(),
        "new_eager_with_comments must NOT silently swallow scanner errors"
    );
}
3586
3587 /// Drive the parser pipeline on `input` in a dedicated thread, returning
3588 /// `None` if it doesn't finish within `Duration::from_secs(2)`. Used by
3589 /// regression tests for parser hangs so a still-broken parser doesn't
3590 /// block the whole `cargo test` run.
3591 fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
3592 use crate::parser::{BasicParser, Parser as ParserTrait};
3593 use std::sync::mpsc;
3594 use std::thread;
3595 use std::time::Duration;
3596
3597 let owned = input.to_string();
3598 let (tx, rx) = mpsc::channel();
3599 thread::spawn(move || {
3600 let mut p = BasicParser::new_eager(owned);
3601 let _ = p.take_scanning_error();
3602 let mut events = Vec::new();
3603 loop {
3604 match p.get_event() {
3605 Ok(Some(ev)) => events.push(ev),
3606 Ok(None) => break,
3607 Err(_) => break,
3608 }
3609 }
3610 let _ = tx.send(events);
3611 });
3612 rx.recv_timeout(Duration::from_secs(2)).ok()
3613 }
3614
/// Regression: `---` immediately followed by non-space text used to spin
/// the scanner forever. The line-start `-` arm tried `scan_document_start`
/// (which correctly returned None) and then `is_plain_scalar_start` (false
/// for `-`), so nothing was consumed and the outer loop re-entered on the
/// same character. The fix falls through to `scan_plain_scalar` whenever
/// the text is not a document marker.
/// See yaml-test-suite tests 82AN / EXG3.
#[test]
fn three_dashes_directly_followed_by_text_does_not_hang() {
    use crate::parser::EventType;

    let events = parse_with_timeout("---word1\nword2\n")
        .expect("parser hung — `---word1` should not produce an infinite loop");

    // A scalar that starts with `---` proves the dashes were consumed as
    // part of a plain scalar, not split off as a document marker.
    let starts_with_dashes = events
        .iter()
        .filter_map(|event| match &event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .any(|value| value.starts_with("---"));
    assert!(
        starts_with_dashes,
        "expected a plain scalar starting with `---`, got events: {events:?}"
    );
}
3641
/// YAML 1.2 §7.3.3 lets `?`, `:` and `-` begin a plain scalar when the next
/// character is non-space (and, in flow context, not a flow indicator).
/// `is_plain_scalar_start` used to reject all three outright, so `?foo`,
/// `:foo` and `-foo` were reported as `Invalid character`.
/// Tracked by yaml-test-suite 2EBW.
#[test]
fn question_mark_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("?foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Every scalar value, in order, up to end of stream or first error.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["?foo", "bar"]);
}
3661
/// A `:` directly followed by text starts a plain scalar, so `:foo: bar`
/// is a mapping from `:foo` to `bar`.
#[test]
fn colon_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager(":foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Every scalar value, in order, up to end of stream or first error.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec![":foo", "bar"]);
}
3675
/// Every started document must be closed by a DocumentEnd event before
/// StreamEnd. The `TokenType::StreamEnd` handler used to emit `-DOC` only
/// for the `DocumentContent` / `BlockNode` states and skipped the
/// `DocumentStart` state (reached after `---` and a lone scalar such as
/// `"foo"`), losing the `-DOC` event. Affects yaml-test-suite 27NA,
/// 2G84/*, 2LFX and several others.
#[test]
fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"foo\"\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Reduce the event stream to yaml-test-suite style labels.
    let kinds: Vec<&str> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .map(|event| match event.event_type {
            EventType::StreamStart => "+STR",
            EventType::StreamEnd => "-STR",
            EventType::DocumentStart { .. } => "+DOC",
            EventType::DocumentEnd { .. } => "-DOC",
            EventType::Scalar { .. } => "=VAL",
            _ => "?",
        })
        .collect();

    // Critical: -DOC must come before -STR.
    let doc_end_idx = kinds.iter().position(|label| *label == "-DOC");
    let str_end_idx = kinds.iter().position(|label| *label == "-STR");
    assert!(
        doc_end_idx.is_some(),
        "missing -DOC in event stream: {kinds:?}"
    );
    assert!(
        doc_end_idx < str_end_idx,
        "expected -DOC before -STR, got {kinds:?}"
    );
}
3710
/// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings.
///
/// Covers all three numeric escape widths — `\x` (two hex digits), `\u`
/// (four) and `\U` (eight) — plus a literal non-ASCII character, which
/// must pass through unchanged.
#[test]
fn double_quoted_hex_escapes_decode_to_codepoint() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
    for (input, expected) in [
        (r#""\x41""#, "A"),
        (r#""\u00e9""#, "\u{e9}"),
        (r#""é""#, "é"),
        (r#""\U0001F600""#, "\u{1f600}"),
    ] {
        let mut p = BasicParser::new_eager(input.to_string());
        assert!(
            p.take_scanning_error().is_none(),
            "no scan error for {input}"
        );
        // The first scalar in the stream is the decoded string.
        let mut found = None;
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                found = Some(value);
                break;
            }
        }
        assert_eq!(found.as_deref(), Some(expected), "input {input}");
    }
}
3735
/// A `\x` escape with only one hex digit before the closing quote must be
/// reported as a scanning error.
#[test]
fn truncated_hex_escape_is_a_scan_error() {
    use crate::parser::BasicParser;

    // The five characters `"\x4"`.
    let truncated = String::from("\"\\x4\"");
    let mut parser = BasicParser::new_eager(truncated);
    let scan_error = parser.take_scanning_error();
    assert!(scan_error.is_some(), "truncated \\x escape must error");
}
3745
/// YAML 1.2 §5.7 gives double-quoted strings a strict allowlist of escape
/// sequences; `\.` (like any other unknown escape) must be rejected,
/// either by the scanner or by the parser. Tracked by yaml-test-suite 55WF.
#[test]
fn invalid_double_quoted_escape_is_a_scan_error() {
    use crate::parser::{BasicParser, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"\\.\"\n".to_string());

    // Accept a scanning error; failing that, drain the events and accept
    // a parse error.
    let rejected = parser.take_scanning_error().is_some()
        || loop {
            match parser.get_event() {
                Ok(Some(_)) => continue,
                Ok(None) => break false,
                Err(_) => break true,
            }
        };
    assert!(
        rejected,
        "`\\.` is not a valid double-quoted escape and must error"
    );
}
3772
/// When a complex-key marker (`?`) is the first content after an explicit
/// document start (`---`), it must open an implicit block mapping. The
/// parser used to handle `?` only in the `ImplicitDocumentStart`,
/// `DocumentContent` and already-in-mapping states and errored out in
/// `DocumentStart`, breaking inputs such as
/// `--- !!set\n? Mark McGwire\n...`. Tracked by yaml-test-suite 2XXW.
#[test]
fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser =
        BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Drain the stream, noting any MappingStart; the loop evaluates to
    // whether a parse error cut it short.
    let mut saw_map_start = false;
    let failed = loop {
        match parser.get_event() {
            Ok(Some(event)) => {
                saw_map_start |= matches!(event.event_type, EventType::MappingStart { .. });
            }
            Ok(None) => break false,
            Err(_) => break true,
        }
    };
    assert!(!failed, "complex key after `--- !!set` must not error");
    assert!(saw_map_start, "expected a MappingStart event");
}
3803
/// YAML 1.2 §6.9.2: anchor / alias names exclude only whitespace and the
/// flow indicators `,[]{}`. `scan_identifier` was once limited to ASCII
/// alphanumerics, `_` and `-`, which rejected valid Unicode anchors such
/// as `&😁`. Tracked by yaml-test-suite 8XYN.
#[test]
fn anchor_name_may_contain_unicode_symbols() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
    assert!(
        parser.take_scanning_error().is_none(),
        "unicode anchor must not error"
    );

    // Anchors attached to scalar events, in stream order.
    let anchors: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { anchor, .. } => anchor,
            _ => None,
        })
        .collect();
    assert_eq!(anchors, vec!["😁"]);
}
3827
/// YAML 1.2 §5.6 / RFC 3986 percent-encoding: a tag suffix may contain
/// `%XX` escapes, which are URI-decoded when the tag is resolved. The
/// scanner used to reject `%` in a tag suffix as "Invalid character", so
/// `!e!tag%21 baz` failed before the resolver could decode it.
/// Tracked by yaml-test-suite 6CK3.
#[test]
fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let source = "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n";
    let mut parser = BasicParser::new_eager(source.to_string());
    assert!(
        parser.take_scanning_error().is_none(),
        "tag percent-escapes must not error"
    );

    // Resolved tags of scalar events, in stream order.
    let tags: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { tag, .. } => tag,
            _ => None,
        })
        .collect();
    assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
}
3851
/// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it does
/// not recognize." A reserved `%FOO` directive must therefore not be a
/// scan error; its line is skipped and parsing carries on.
/// Tracked by yaml-test-suite test 2LFX.
#[test]
fn reserved_directive_is_ignored_not_an_error() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let source = "%FOO bar baz # Should be ignored\n # with a warning.\n---\n\"foo\"\n";
    let mut parser = BasicParser::new_eager(source.to_string());
    assert!(
        parser.take_scanning_error().is_none(),
        "unknown directives must NOT produce a scan error"
    );

    // Only the document's single scalar should come out.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["foo"]);
}
3875
/// The two physical lines of `---word1\nword2` must fold into the single
/// plain scalar `"---word1 word2"`. Tracked by yaml-test-suite 82AN.
#[test]
fn three_dashes_followed_by_text_folds_continuation_line() {
    use crate::parser::EventType;

    let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");

    let mut scalars: Vec<&str> = Vec::new();
    for event in &events {
        if let EventType::Scalar { value, .. } = &event.event_type {
            scalars.push(value.as_str());
        }
    }
    assert_eq!(scalars, vec!["---word1 word2"]);
}
3890
/// Regression: a tab between the block-entry marker and a `-N` value used
/// to hang the scanner through the same `-` match arm.
/// See yaml-test-suite Y79Y/010.
#[test]
fn dash_tab_negative_number_does_not_hang() {
    let outcome = parse_with_timeout("-\t-1\n");
    let events = outcome.expect("parser hung — `-\\t-1` should not produce an infinite loop");
    assert!(!events.is_empty(), "expected event stream, got none");
}
3900
/// A bare `42` scans to exactly StreamStart, Scalar("42"), StreamEnd.
#[test]
fn test_basic_tokenization() {
    let mut scanner = BasicScanner::new("42".to_string());
    assert!(scanner.check_token());

    let opening = scanner.get_token().unwrap().unwrap();
    assert!(matches!(opening.token_type, TokenType::StreamStart));

    let number = scanner.get_token().unwrap().unwrap();
    match number.token_type {
        TokenType::Scalar(text, _) => assert_eq!(text, "42"),
        _ => panic!("Expected scalar token"),
    }

    let closing = scanner.get_token().unwrap().unwrap();
    assert!(matches!(closing.token_type, TokenType::StreamEnd));
}
3923
/// Scans the flow sequence `[1, 2, 3]` and checks the leading tokens after
/// the first one: `FlowSequenceStart`, the scalar `1`, then `FlowEntry`.
#[test]
fn test_flow_sequence() {
    let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

    // StreamStart (consumed without inspecting its type)
    scanner.get_token().unwrap();

    // [
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

    // 1 — a non-scalar token must fail the test instead of silently
    // skipping the value assertion.
    let token = scanner.get_token().unwrap().unwrap();
    if let TokenType::Scalar(value, _) = token.token_type {
        assert_eq!(value, "1");
    } else {
        panic!("Expected scalar token");
    }

    // ,
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowEntry));
}
3945
/// A double-quoted string must scan to a scalar whose value is the text
/// between the quotes.
#[test]
fn test_quoted_strings() {
    let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

    // Discard the leading StreamStart token.
    scanner.get_token().unwrap();

    let quoted = scanner.get_token().unwrap().unwrap();
    match quoted.token_type {
        TokenType::Scalar(text, _) => assert_eq!(text, "hello world"),
        _ => panic!("Expected scalar token"),
    }
}
3961
/// Full-line and end-of-line comments must not surface in the token stream:
/// only the mapping's keys and values appear as scalars, and no
/// `TokenType::Comment` token is produced.
#[test]
fn test_comment_handling() {
    let input = r"
# Full line comment
key: value # End of line comment
# Another comment
data: test
";
    let mut scanner = BasicScanner::new(input.to_string());

    // Drain the scanner until it reports end of input or an error.
    let tokens: Vec<_> = std::iter::from_fn(|| scanner.get_token().ok().flatten()).collect();

    // Scalar texts in the order they were scanned.
    let mut scalar_values: Vec<String> = Vec::new();
    for token in tokens.iter() {
        if let TokenType::Scalar(text, _) = &token.token_type {
            scalar_values.push(text.clone());
        }
    }
    assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

    // No comment token may have been emitted.
    let has_comment_token = tokens
        .iter()
        .any(|token| matches!(token.token_type, TokenType::Comment(_)));
    assert!(!has_comment_token);
}
3995
/// A `#` inside a double- or single-quoted string is part of the scalar,
/// while a `#` after a plain value starts a comment whose text must not
/// appear in any scalar.
#[test]
fn test_hash_in_strings() {
    let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
    let mut scanner = BasicScanner::new(input.to_string());

    // Collect every scalar's text until the scanner ends or errors.
    let scalar_values: Vec<String> = std::iter::from_fn(|| scanner.get_token().ok().flatten())
        .filter_map(|token| match token.token_type {
            TokenType::Scalar(text, _) => Some(text),
            _ => None,
        })
        .collect();

    let has_scalar = |wanted: &str| scalar_values.iter().any(|s| s.as_str() == wanted);
    assert!(has_scalar("This has a # character"));
    assert!(has_scalar("Also has # character"));
    assert!(has_scalar("value"));

    // The trailing comment text must not have leaked into any scalar.
    assert!(
        scalar_values
            .iter()
            .all(|s| !s.contains("This is a comment"))
    );
}
4021
/// Checks the common double-quoted escapes: each input must scan to a
/// scalar holding the decoded text.
#[test]
fn test_escape_sequences() {
    // YAML 1.2 §5.7 covers double-quoted escape sequences. Single-quoted
    // strings have no backslash escapes (`''` is their only escape), so
    // every case below is double-quoted.
    let cases = [
        (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
        (r#""Col1\tCol2""#, "Col1\tCol2"),
        (r#""First\rSecond""#, "First\rSecond"),
        (r#""Path\\to\\file""#, "Path\\to\\file"),
        (r#""He said \"Hello\"""#, "He said \"Hello\""),
    ];

    for (input, expected) in cases {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Discard StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => {
                    assert_eq!(value, expected, "Failed for input: {}", input);
                }
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    }
}
4050
/// Checks the less common single-character escapes accepted inside
/// double-quoted strings; each must decode to the listed character.
#[test]
fn test_extended_yaml_escapes() {
    // (double-quoted input, expected decoded scalar)
    let cases = [
        (r#""\0""#, "\0"),   // NUL
        (r#""\a""#, "\x07"), // bell
        (r#""\b""#, "\x08"), // backspace
        (r#""\f""#, "\x0C"), // form feed
        (r#""\v""#, "\x0B"), // vertical tab
        (r#""\e""#, "\x1B"), // escape
        (r#""\ ""#, " "),    // escaped space
        (r#""\/""#, "/"),    // escaped forward slash
    ];

    for (input, expected) in cases {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Discard StreamStart

        // Anything other than `Ok(Some(_))` means no token was produced.
        let token = match scanner.get_token() {
            Ok(Some(token)) => token,
            _ => panic!("Failed to get token for input: {}", input),
        };

        match token.token_type {
            TokenType::Scalar(value, _) => {
                assert_eq!(value, expected, "Failed for input: {}", input);
            }
            _ => panic!("Expected scalar token for input: {}", input),
        }
    }
}
4080
/// Unknown escapes inside double quotes must be rejected by the scanner.
#[test]
fn test_unknown_escape_sequences() {
    // Per YAML 1.2 §5.7 an unrecognized double-quoted escape is a scan
    // error rather than a literal to preserve. (Earlier versions of this
    // scanner kept the backslash + char verbatim — see commit history.)
    let invalid_inputs = [r#""\z""#, r#""\q""#, r#""\8""#];

    for input in invalid_inputs {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // StreamStart

        let outcome = scanner.get_token();
        assert!(
            outcome.is_err(),
            "expected scan error for invalid escape in {input}"
        );
    }
}
4095}