rust_yaml/scanner/mod.rs
1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
15/// Trait for YAML scanners that convert character streams to tokens
16pub trait Scanner {
17 /// Check if there are more tokens available
18 fn check_token(&self) -> bool;
19
20 /// Peek at the next token without consuming it
21 fn peek_token(&self) -> Result<Option<&Token>>;
22
23 /// Get the next token, consuming it
24 fn get_token(&mut self) -> Result<Option<Token>>;
25
26 /// Reset the scanner state
27 fn reset(&mut self);
28
29 /// Get the current position in the input
30 fn position(&self) -> Position;
31
32 /// Get the input text for error reporting
33 fn input(&self) -> &str;
34}
35
/// How the tail of a block scalar is treated (YAML 1.2 §8.1.1.2).
///
/// - `Strip` (`-`): the final line break and trailing empty lines are dropped.
/// - `Clip` (default): exactly one final line break is kept; trailing empty
///   lines are dropped.
/// - `Keep` (`+`): the final line break and every trailing empty line survive.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    Strip,
    Clip,
    Keep,
}

/// Trim the tail of a collected block scalar according to `mode`.
///
/// The input is expected to carry one `\n` per collected line (content or
/// blank). The result follows spec §8.1.1.2:
///
/// - **Keep:** returned untouched.
/// - **Strip:** every trailing `\n` is removed.
/// - **Clip:** every trailing `\n` is removed and a single one is put back,
///   but only when something other than line breaks remains. An empty clip
///   scalar is `""`, not `"\n"` (yaml-test-suite K858).
fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
    if mode == ChompingMode::Keep {
        return s;
    }

    // Strip and Clip both start by discarding the whole run of trailing
    // line breaks.
    let content_len = s.trim_end_matches('\n').len();
    s.truncate(content_len);

    // Clip restores exactly one line break when real content is left.
    if mode == ChompingMode::Clip && !s.is_empty() {
        s.push('\n');
    }
    s
}
81
82/// A basic scanner implementation for YAML tokenization
83#[derive(Debug)]
84#[allow(dead_code)]
85pub struct BasicScanner {
86 input: String,
87 position: Position,
88 current_char: Option<char>,
89 tokens: Vec<Token>,
90 token_index: usize,
91 done: bool,
92 indent_stack: Vec<usize>,
93 current_indent: usize,
94 allow_simple_key: bool,
95 simple_key_allowed: bool,
96 flow_level: usize,
97 preserve_comments: bool,
98 // Indentation style detection
99 detected_indent_style: Option<crate::value::IndentStyle>,
100 indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
101 previous_indent_level: usize, // Track the previous indentation for style detection
102 // Performance optimizations
103 buffer: String, // Reusable string buffer for token values
104 char_cache: Vec<char>, // Cached characters for faster access
105 char_indices: Vec<(usize, char)>, // Cached character indices for O(1) lookups
106 current_char_index: usize, // Current index in char_cache
107 profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
108 // Error tracking
109 scanning_error: Option<Error>, // Store scanning errors for later retrieval
110 // Resource tracking
111 limits: Limits,
112 resource_tracker: ResourceTracker,
113 // Track inline nested sequences that need closing
114 inline_sequence_depth: usize,
115 // Track compact-notation sequences (where `-` is at the same indent as
116 // the parent mapping keys). These are NOT on indent_stack, so we need
117 // separate tracking to know when to emit BlockEnd for them.
118 compact_sequence_indents: Vec<usize>,
119 // Parallel to indent_stack: true when the entry was pushed by a block
120 // sequence, false when by a mapping. Lets us distinguish "continuing a
121 // regular sequence" from "starting a compact sequence at same indent".
122 indent_is_sequence: Vec<bool>,
123}
124
125impl BasicScanner {
126 /// Create a new scanner from input string
127 pub fn new(input: String) -> Self {
128 Self::with_limits(input, Limits::default())
129 }
130
131 /// Create a new scanner with custom resource limits
132 pub fn with_limits(input: String, limits: Limits) -> Self {
133 let char_cache: Vec<char> = input.chars().collect();
134 let char_indices: Vec<(usize, char)> = input.char_indices().collect();
135 let current_char = char_cache.first().copied();
136
137 // Track document size for resource limits
138 let mut resource_tracker = ResourceTracker::new();
139 if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
140 // If the input is too large, create scanner with error state
141 return Self {
142 current_char: None,
143 input,
144 position: Position::start(),
145 tokens: Vec::new(),
146 token_index: 0,
147 done: true,
148 indent_stack: vec![0],
149 current_indent: 0,
150 allow_simple_key: false,
151 simple_key_allowed: false,
152 flow_level: 0,
153 preserve_comments: false,
154 detected_indent_style: None,
155 indent_samples: Vec::new(),
156 previous_indent_level: 0,
157 buffer: String::new(),
158 char_cache: Vec::new(),
159 char_indices: Vec::new(),
160 current_char_index: 0,
161 profiler: None,
162 scanning_error: Some(e),
163 limits,
164 resource_tracker,
165 inline_sequence_depth: 0,
166 compact_sequence_indents: Vec::new(),
167 indent_is_sequence: vec![false],
168 };
169 }
170
171 Self {
172 current_char,
173 input,
174 position: Position::start(),
175 tokens: Vec::new(),
176 token_index: 0,
177 done: false,
178 indent_stack: vec![0], // Always start with base indentation
179 current_indent: 0,
180 allow_simple_key: true,
181 simple_key_allowed: true,
182 flow_level: 0,
183 preserve_comments: false,
184 detected_indent_style: None,
185 indent_samples: Vec::new(),
186 previous_indent_level: 0,
187 buffer: String::with_capacity(64), // Pre-allocate buffer
188 char_cache,
189 char_indices,
190 current_char_index: 0,
191 profiler: std::env::var("RUST_YAML_PROFILE")
192 .ok()
193 .map(|_| crate::profiling::YamlProfiler::new()),
194 scanning_error: None,
195 limits,
196 resource_tracker,
197 inline_sequence_depth: 0,
198 compact_sequence_indents: Vec::new(),
199 indent_is_sequence: vec![false],
200 }
201 }
202
203 /// Create a new scanner with eager token scanning (for compatibility)
204 pub fn new_eager(input: String) -> Self {
205 Self::new_eager_with_limits(input, Limits::default())
206 }
207
208 /// Create a new scanner with eager token scanning and custom limits
209 pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
210 let mut scanner = Self::with_limits(input, limits);
211 // Store any scanning errors for later retrieval
212 if let Err(error) = scanner.scan_all_tokens() {
213 scanner.scanning_error = Some(error);
214 }
215 scanner
216 }
217
218 /// Create a new scanner with comment preservation enabled
219 pub fn new_with_comments(input: String) -> Self {
220 let mut scanner = Self::new(input);
221 scanner.preserve_comments = true;
222 scanner
223 }
224
225 /// Create a new scanner with comments and custom limits
226 pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
227 let mut scanner = Self::with_limits(input, limits);
228 scanner.preserve_comments = true;
229 scanner
230 }
231
232 /// Create a new scanner with eager scanning and comment preservation
233 pub fn new_eager_with_comments(input: String) -> Self {
234 let mut scanner = Self::new_with_comments(input);
235 scanner.scan_all_tokens().unwrap_or(());
236 scanner
237 }
238
239 /// Get the detected indentation style from the document
240 pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
241 self.detected_indent_style.as_ref()
242 }
243
244 /// Check if there was a scanning error
245 pub const fn has_scanning_error(&self) -> bool {
246 self.scanning_error.is_some()
247 }
248
249 /// Get the scanning error if any
250 #[allow(clippy::missing_const_for_fn)]
251 pub fn take_scanning_error(&mut self) -> Option<Error> {
252 self.scanning_error.take()
253 }
254
255 /// Advance to the next character
256 fn advance(&mut self) -> Option<char> {
257 if let Some(ch) = self.current_char {
258 self.position = self.position.advance(ch);
259 self.current_char_index += 1;
260
261 if self.current_char_index < self.char_cache.len() {
262 self.current_char = Some(self.char_cache[self.current_char_index]);
263 } else {
264 self.current_char = None;
265 }
266 }
267
268 self.current_char
269 }
270
271 /// Skip whitespace characters (excluding newlines)
272 fn skip_whitespace(&mut self) {
273 while let Some(ch) = self.current_char {
274 if ch == ' ' || ch == '\t' {
275 self.advance();
276 } else {
277 break;
278 }
279 }
280 }
281
282 /// Handle indentation and produce block tokens if necessary
283 fn handle_indentation(&mut self) -> Result<()> {
284 // In flow context: if there is a non-trivial enclosing block
285 // (indent_stack has more than the implicit root level), each
286 // continuation line that has content must be indented MORE than
287 // that enclosing block's indent. \`flow: [a,\\nb,c]\` with \`b\`
288 // at col 1 violates this rule because the block mapping enclosing
289 // \`flow:\` sits at indent 0 (yaml-test-suite 9C9N).
290 //
291 // Top-level flow (no enclosing block; indent_stack is just \[0\])
292 // is exempt — `[a,\\nb]` is fine there because the flow content
293 // isn't nested inside any block (yaml-test-suite 4ZYM).
294 if self.flow_level > 0 {
295 if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
296 let mut probe = 0usize;
297 let mut i = self.current_char_index;
298 while i < self.char_cache.len() {
299 match self.char_cache[i] {
300 ' ' => {
301 probe += 1;
302 i += 1;
303 }
304 '\t' => i += 1,
305 _ => break,
306 }
307 }
308 let has_content = self
309 .char_cache
310 .get(i)
311 .map_or(false, |c| !matches!(c, '\n' | '\r'));
312 // A line that begins with the matching flow closer
313 // (\`]\` / \`}\`) is allowed at the parent indent — it
314 // closes the flow collection, not adds content
315 // (yaml-test-suite NKF9 trailing-line \`}\` at col 1).
316 let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
317 if has_content && !is_closer {
318 let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
319 if probe <= parent_indent {
320 return Err(Error::scan(
321 self.position,
322 "Flow content line is not indented enough".to_string(),
323 ));
324 }
325 }
326 }
327 return Ok(());
328 }
329
330 let line_start_pos = self.position;
331 let mut indent = 0;
332 let mut has_tabs = false;
333 let mut has_spaces = false;
334 let _indent_start_pos = self.position;
335
336 // Count indentation and detect style
337 while let Some(ch) = self.current_char {
338 if ch == ' ' {
339 indent += 1;
340 has_spaces = true;
341 self.advance();
342 } else if ch == '\t' {
343 indent += 8; // Tab counts as 8 spaces for indentation calculation
344 has_tabs = true;
345 self.advance();
346 } else {
347 break;
348 }
349 }
350
351 // Analyze indentation pattern for style detection
352 // Only analyze if there's actual content after the indentation (not just whitespace)
353 if indent > 0
354 && self.current_char.is_some()
355 && !matches!(self.current_char, Some('\n' | '\r'))
356 {
357 self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
358 }
359
360 // YAML 1.2 §6.1 does NOT require all indents to be multiples
361 // of a single "indent width". Siblings must share a column;
362 // children must indent further; but any positive amount works
363 // (e.g. `key:\n child:\n grandchild:` with widths 2, 1
364 // is legal). The earlier strict-multiple-of-N check rejected
365 // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
366 // UGM3. We rely on the indent_stack-driven open/close logic
367 // (and the per-block "more than parent" rule enforced
368 // elsewhere) to catch genuine mis-indentation.
369
370 // Update previous indentation level for future comparisons
371 if indent > 0 {
372 self.previous_indent_level = indent;
373 }
374
375 // Update current indentation level
376 self.current_indent = indent;
377
378 // Close compact-notation sequences whose scope ends at this line.
379 // A compact sequence (where `-` shares the indent of the parent
380 // mapping keys) ends when the next content line at that indent is
381 // NOT a block entry (`- `). We must emit the sequence's BlockEnd
382 // BEFORE popping the indent_stack so that the nesting order is
383 // correct (sequence closes before its parent mapping).
384 let has_content =
385 self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
386 if has_content {
387 let is_block_entry = self.current_char == Some('-')
388 && self.peek_char(1).map_or(true, |c| c.is_whitespace());
389 while let Some(&seq_indent) = self.compact_sequence_indents.last() {
390 if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
391 self.compact_sequence_indents.pop();
392 self.tokens
393 .push(Token::simple(TokenType::BlockEnd, line_start_pos));
394 } else {
395 break;
396 }
397 }
398 }
399
400 // Check if we need to emit block end tokens for decreased indentation
401 let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
402 while let Some(&last_indent) = self.indent_stack.last() {
403 if indent < last_indent && last_indent > 0 {
404 self.indent_stack.pop();
405 self.indent_is_sequence.pop();
406 self.tokens
407 .push(Token::simple(TokenType::BlockEnd, line_start_pos));
408 } else {
409 break;
410 }
411 }
412
413 // §6.1: after a dedent, the new line's indent must match some
414 // existing container level — keys/items at a sibling level
415 // must share a column. Landing at a column that is between
416 // two stack levels (e.g. parent at 0, just-closed at 3, new
417 // line at 1) is invalid because no open mapping/sequence sits
418 // at indent 1 (yaml-test-suite DMG6, N4JP).
419 //
420 // The check applies only when:
421 // * we actually dedented (pre-pop top was deeper than now),
422 // * the new line has content (the next char is not blank /
423 // newline / EOF / comment),
424 // * indent doesn't match the new top.
425 if pre_pop_top > 0
426 && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
427 && self
428 .current_char
429 .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
430 && indent != self.indent_stack.last().copied().unwrap_or(0)
431 {
432 // Allow if indent is a valid deeper level — e.g.
433 // sibling at depth then deeper child — but for the
434 // dedent path indent must equal a known stack level.
435 return Err(Error::scan(
436 self.position,
437 format!(
438 "Indentation {indent} doesn't match any open container (expected {} or deeper)",
439 self.indent_stack.last().copied().unwrap_or(0)
440 ),
441 ));
442 }
443
444 Ok(())
445 }
446
447 /// Analyze indentation pattern to detect the document's indentation style
448 fn analyze_indentation_pattern(
449 &mut self,
450 current_indent: usize,
451 has_tabs: bool,
452 has_spaces: bool,
453 ) -> Result<()> {
454 // Prevent mixed indentation (tabs + spaces on same line).
455 // Carve-out: a tab AFTER one or more spaces and BEFORE
456 // value-position content (not a key) is content-area
457 // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
458 // space is indent, the tab is a separator before \`bar\`
459 // which is the value of \`foo:\` (yaml-test-suite DK95/00).
460 if has_tabs && has_spaces {
461 // Peek ahead: if the content after the tab+spaces area
462 // contains a key marker (`: ` or `:`+EOL), treat as
463 // indentation (invalid). Otherwise it's a value line.
464 let looks_like_key = self.line_after_indent_is_implicit_key();
465 if looks_like_key {
466 let context =
467 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
468 .with_suggestion(
469 "Use either tabs OR spaces for indentation, not both".to_string(),
470 );
471 return Err(Error::invalid_character_with_context(
472 self.position,
473 '\t',
474 "mixed indentation",
475 context,
476 ));
477 }
478 }
479 // §6.1: indentation must be space characters only. Pure-tab
480 // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
481 // 4EJS). Two carve-outs:
482 // * The mixed case is caught by the earlier branch.
483 // * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
484 // at the root are not "block indentation" — there's no
485 // enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
486 // them.
487 if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
488 let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
489 .with_suggestion("Use space characters for indentation".to_string());
490 return Err(Error::invalid_character_with_context(
491 self.position,
492 '\t',
493 "indentation",
494 context,
495 ));
496 }
497
498 // If we detected tabs, check for mixed indentation across lines
499 if has_tabs {
500 match self.detected_indent_style {
501 None => {
502 // First time detecting indentation style - set to tabs
503 self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
504 }
505 Some(crate::value::IndentStyle::Spaces(_)) => {
506 // Previously detected spaces, now seeing tabs - mixed indentation error
507 let context =
508 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
509 .with_suggestion(
510 "Use consistent indentation style throughout the document"
511 .to_string(),
512 );
513 return Err(Error::invalid_character_with_context(
514 self.position,
515 '\t',
516 "mixed indentation",
517 context,
518 ));
519 }
520 Some(crate::value::IndentStyle::Tabs) => {
521 // Already using tabs - this is consistent
522 }
523 }
524 return Ok(());
525 }
526
527 // For spaces, check for mixed indentation across lines first
528 if has_spaces {
529 // Check if we previously detected tabs
530 if matches!(
531 self.detected_indent_style,
532 Some(crate::value::IndentStyle::Tabs)
533 ) {
534 let context =
535 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
536 .with_suggestion(
537 "Use consistent indentation style throughout the document".to_string(),
538 );
539 return Err(Error::invalid_character_with_context(
540 self.position,
541 ' ',
542 "mixed indentation",
543 context,
544 ));
545 }
546
547 // Calculate the indentation level difference
548 if current_indent > self.previous_indent_level {
549 let indent_diff = current_indent - self.previous_indent_level;
550
551 // Store this sample for analysis (but only meaningful differences)
552 if indent_diff > 0 && indent_diff <= 8 {
553 // Reasonable indentation range
554 self.indent_samples.push((indent_diff, false));
555
556 // Try to determine consistent indentation width
557 if self.detected_indent_style.is_none() {
558 self.detect_space_indentation_width();
559 }
560 }
561 }
562
563 // YAML 1.2 §6.1 does NOT require all indents to be multiples
564 // of a single "indent width". Sibling lines must share a
565 // column and children must indent deeper than parents, but
566 // any positive amount works. The "multiple of N" check
567 // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
568 // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
569 // open/close logic for genuine mis-indentation. The detected
570 // style is still recorded for later style-preservation use
571 // (e.g. emitter), it just no longer drives validation.
572 // self.validate_indentation_consistency(current_indent)?;
573 }
574
575 Ok(())
576 }
577
578 /// Detect the consistent space indentation width from samples
579 fn detect_space_indentation_width(&mut self) {
580 if self.indent_samples.is_empty() {
581 return; // Need at least 1 sample
582 }
583
584 // Find the most common indentation width
585 let mut width_counts = std::collections::HashMap::new();
586
587 for &(width, is_tabs) in &self.indent_samples {
588 if !is_tabs && width > 0 {
589 *width_counts.entry(width).or_insert(0) += 1;
590 }
591 }
592
593 // Find the most frequent width - be more aggressive and detect early
594 if let Some((&most_common_width, &_count)) =
595 width_counts.iter().max_by_key(|&(_, count)| count)
596 {
597 // Set on first consistent sample to enable stricter validation
598 self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
599 }
600 }
601
602 /// Check if the given indentation level is valid based on current context
603 #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
604 fn is_valid_indentation_level(&self, indent: usize) -> bool {
605 // For now, allow any indentation that could represent valid nesting
606 // In the future, this could be made more strict by checking against
607 // the current indent_stack to ensure proper nesting
608 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
609 // Must be a multiple of the detected width
610 indent % width == 0
611 } else {
612 // If no style detected yet, allow any indentation
613 true
614 }
615 }
616
617 /// Validate that current indentation is consistent with detected style
618 fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
619 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
620 // Check if current indentation is a multiple of the detected width
621 if current_indent > 0 && current_indent % width != 0 {
622 let lower_level = (current_indent / width) * width;
623 let higher_level = lower_level + width;
624 let suggestion = format!(
625 "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
626 width, lower_level, higher_level, current_indent
627 );
628 let context =
629 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
630 .with_suggestion(suggestion);
631 return Err(Error::indentation_with_context(
632 self.position,
633 (current_indent / width) * width, // expected (nearest valid level)
634 current_indent, // found
635 context,
636 ));
637 }
638 }
639 Ok(())
640 }
641
642 /// Check if current position starts a plain scalar
643 fn is_plain_scalar_start(&self) -> bool {
644 self.current_char.map_or(false, |ch| match ch {
645 // Pure indicators — never start a plain scalar.
646 ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
647 | '@' | '`' => false,
648 // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
649 // the next character is non-whitespace (and, in flow context,
650 // not a flow indicator). Otherwise they act as indicators
651 // (complex-key marker / value separator / block-entry marker).
652 '?' | ':' | '-' => match self.peek_char(1) {
653 None => false,
654 Some(c) if c.is_whitespace() => false,
655 Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
656 Some(_) => true,
657 },
658 _ => !ch.is_whitespace(),
659 })
660 }
661
662 /// Check if the value is a YAML boolean
663 fn is_yaml_bool(value: &str) -> bool {
664 matches!(
665 value,
666 "true"
667 | "false"
668 | "True"
669 | "False"
670 | "TRUE"
671 | "FALSE"
672 | "yes"
673 | "no"
674 | "Yes"
675 | "No"
676 | "YES"
677 | "NO"
678 | "on"
679 | "off"
680 | "On"
681 | "Off"
682 | "ON"
683 | "OFF"
684 )
685 }
686
687 /// Check if the value is a YAML null
688 fn is_yaml_null(value: &str) -> bool {
689 matches!(value, "null" | "Null" | "NULL" | "~" | "")
690 }
691
692 /// Normalize a scalar value based on YAML rules.
693 ///
694 /// The scanner preserves the original text of plain scalars. Type
695 /// resolution (including version-aware bool/null mapping) happens in
696 /// the composer (see `crate::resolver::resolve_plain_scalar`). This
697 /// preserves enough information for the composer to apply the
698 /// YAML 1.1 vs 1.2 distinction and for round-trip emitters to
699 /// recover the original spelling.
700 fn normalize_scalar(value: String) -> String {
701 value
702 }
703
704 /// Scan a number token
705 fn scan_number(&mut self) -> Result<Token> {
706 let start_pos = self.position;
707 let mut value = String::new();
708
709 // Handle negative numbers
710 if self.current_char == Some('-') {
711 value.push('-');
712 self.advance();
713 }
714
715 // Scan digits
716 while let Some(ch) = self.current_char {
717 if ch.is_ascii_digit() {
718 value.push(ch);
719 self.advance();
720 } else if ch == '.' {
721 value.push(ch);
722 self.advance();
723 // Scan fractional part
724 while let Some(ch) = self.current_char {
725 if ch.is_ascii_digit() {
726 value.push(ch);
727 self.advance();
728 } else {
729 break;
730 }
731 }
732 break;
733 } else {
734 break;
735 }
736 }
737
738 Ok(Token::new(
739 TokenType::Scalar(value, tokens::QuoteStyle::Plain),
740 start_pos,
741 self.position,
742 ))
743 }
744
745 /// Scan a plain scalar (unquoted string)
746 fn scan_plain_scalar(&mut self) -> Result<Token> {
747 let start_pos = self.position;
748 let start_col = start_pos.column;
749 let mut value = String::new();
750 let mut multi_line = false;
751
752 loop {
753 // Scan content on the current line until we hit a stop condition.
754 while let Some(ch) = self.current_char {
755 if self.flow_level == 0 {
756 match ch {
757 '\n' | '\r' => break,
758 ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
759 '#' if value.is_empty()
760 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
761 {
762 break;
763 }
764 _ => {}
765 }
766 } else {
767 match ch {
768 // Same line-break handling as block context: stop
769 // collecting raw content at `\n`/`\r`, then let the
770 // outer fold logic decide whether the next line
771 // continues this scalar (yaml-test-suite 8KB6,
772 // 8UDB, 9BXH).
773 '\n' | '\r' => break,
774 ',' | '[' | ']' | '{' | '}' => break,
775 // In flow context, `:` is a key-value separator
776 // when followed by whitespace OR any flow indicator
777 // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
778 // suite FRK4 (`{ ? foo :, ... }`).
779 ':' if self
780 .peek_char(1)
781 .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
782 {
783 break;
784 }
785 '#' if value.is_empty()
786 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
787 {
788 break;
789 }
790 _ => {}
791 }
792 }
793 value.push(ch);
794 self.advance();
795 }
796
797 // If we didn't stop at a newline, this scalar is complete.
798 if !matches!(self.current_char, Some('\n' | '\r')) {
799 break;
800 }
801
802 // Per §6.5 line folding, trailing whitespace on the line is
803 // dropped (it gets replaced by the fold separator that the
804 // next continuation block emits).
805 while matches!(value.chars().last(), Some(' ' | '\t')) {
806 value.pop();
807 }
808
809 // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
810 // the same plain scalar. A continuation line must be:
811 // * indented strictly more than the scalar's start column,
812 // * not a document marker (`---` / `...`),
813 // * not a comment-only line,
814 // * not empty-with-EOF.
815 // Save state for backtracking if continuation isn't allowed.
816 let saved_position = self.position;
817 let saved_index = self.current_char_index;
818 let saved_char = self.current_char;
819
820 // Count physical newlines we skip; whitespace within the lines
821 // is also consumed.
822 let mut newlines = 0usize;
823 loop {
824 match self.current_char {
825 Some('\n') => {
826 newlines += 1;
827 self.advance();
828 }
829 Some('\r') => {
830 self.advance();
831 }
832 Some(' ' | '\t') => {
833 self.advance();
834 }
835 _ => break,
836 }
837 }
838
839 let next_col = self.position.column;
840 let next_ch = self.current_char;
841 let is_doc_marker = matches!(next_ch, Some('-' | '.'))
842 && self.peek_char(1) == next_ch
843 && self.peek_char(2) == next_ch
844 && self.peek_char(3).map_or(true, |c| c.is_whitespace());
845
846 // Continuation column rule:
847 // * Flow context: no column rule, only flow indicators
848 // terminate (8KB6, 8UDB, 9BXH).
849 // * Block context: must be strictly deeper than the parent
850 // block's key column. The parent indent is the max of
851 // `indent_stack.last()` (block mapping/sequence indent)
852 // and `compact_sequence_indents.last()` — the latter
853 // tracks sequences opened compactly (e.g. `? - x` where
854 // the dash didn't push to indent_stack). Without the
855 // compact-stack check, `? - Detroit Tigers\n - Chicago`
856 // would fold both lines into one scalar (yaml-test-
857 // suite M5DY).
858 // Fall back to `next_col >= start_col` for top-level
859 // scalars where there's no enclosing block.
860 let column_ok = if self.flow_level > 0 {
861 true
862 } else {
863 let block_indent = self.indent_stack.last().copied().unwrap_or(0);
864 let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
865 let parent_indent = block_indent.max(compact_indent);
866 next_col >= parent_indent + 2 || next_col >= start_col
867 };
868 let can_continue = next_ch.is_some()
869 && !matches!(next_ch, Some('\n' | '\r' | '#'))
870 && column_ok
871 && !is_doc_marker
872 && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));
873
874 if !can_continue {
875 self.position = saved_position;
876 self.current_char_index = saved_index;
877 self.current_char = saved_char;
878 break;
879 }
880
881 // Append fold separator: single newline → space; N>1 newlines
882 // collapse to N-1 retained newlines (YAML §6.5 line folding).
883 if newlines <= 1 {
884 value.push(' ');
885 } else {
886 for _ in 0..(newlines - 1) {
887 value.push('\n');
888 }
889 }
890 multi_line = true;
891 }
892
893 // YAML 1.2 §8.1.3: implicit keys must be on a single line. If the
894 // plain scalar folded across line breaks AND the next non-
895 // whitespace char is `:` (key-value separator), it's about to be
896 // used as an implicit key — reject (yaml-test-suite G7JE).
897 if multi_line && self.flow_level == 0 {
898 let mut off = 0isize;
899 while matches!(self.peek_char(off), Some(' ' | '\t')) {
900 off += 1;
901 }
902 if self.peek_char(off) == Some(':') {
903 return Err(Error::scan(
904 self.position,
905 "Multi-line plain scalar may not be used as an implicit key".to_string(),
906 ));
907 }
908 }
909
910 self.resource_tracker
911 .check_string_length(&self.limits, value.len())?;
912
913 let value = value.trim_end().to_string();
914 let normalized_value = Self::normalize_scalar(value);
915
916 Ok(Token::new(
917 TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
918 start_pos,
919 self.position,
920 ))
921 }
922
923 /// Scan a quoted string
924 fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
925 let start_pos = self.position;
926 let mut value = String::new();
927
928 // Determine quote style based on quote character
929 let quote_style = match quote_char {
930 '\'' => tokens::QuoteStyle::Single,
931 '"' => tokens::QuoteStyle::Double,
932 _ => tokens::QuoteStyle::Plain,
933 };
934
935 self.advance(); // Skip opening quote
936 let mut closed = false;
937 let mut multi_line = false;
938 // High-water mark of bytes contributed by escape sequences. The
939 // trailing-whitespace strip at fold time must not pop past it,
940 // because an escape-produced \t / space is literal content
941 // (yaml-test-suite DE56/00, DE56/01).
942 let mut escape_end: usize = 0;
943
944 while let Some(ch) = self.current_char {
945 if ch == quote_char {
946 // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
947 // collapsing to a single `'`. Detect that here BEFORE
948 // treating the quote as the closing delimiter.
949 if quote_char == '\'' && self.peek_char(1) == Some('\'') {
950 value.push('\'');
951 self.advance();
952 self.advance();
953 continue;
954 }
955 self.advance(); // Skip closing quote
956 closed = true;
957 break;
958 } else if ch == '\\' && quote_char == '"' {
959 self.advance();
960 if let Some(escaped) = self.current_char {
961 match escaped {
962 // YAML 1.2 §5.7 double-quoted escape allowlist.
963 'n' => value.push('\n'),
964 't' => value.push('\t'),
965 'r' => value.push('\r'),
966 '\\' => value.push('\\'),
967 '"' => value.push('"'),
968 '0' => value.push('\0'),
969 'a' => value.push('\x07'),
970 'b' => value.push('\x08'),
971 'f' => value.push('\x0C'),
972 'v' => value.push('\x0B'),
973 'e' => value.push('\x1B'),
974 ' ' => value.push(' '),
975 '/' => value.push('/'),
976 'N' => value.push('\u{0085}'),
977 '_' => value.push('\u{00A0}'),
978 'L' => value.push('\u{2028}'),
979 'P' => value.push('\u{2029}'),
980 '\n' => {
981 // Escaped line break (§7.3.2): the newline is
982 // dropped AND leading whitespace on the next
983 // line is excluded from the content.
984 self.advance();
985 while matches!(self.current_char, Some(' ' | '\t')) {
986 self.advance();
987 }
988 continue;
989 }
990 '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
991 // Hex / Unicode escapes per YAML 1.2 §5.7:
992 // \xNN — 2 hex digits, codepoint ≤ 0xFF
993 // \uNNNN — 4 hex digits, codepoint ≤ 0xFFFF
994 // \UNNNNNNNN — 8 hex digits, full Unicode codepoint
995 'x' | 'u' | 'U' => {
996 let n = match escaped {
997 'x' => 2,
998 'u' => 4,
999 _ => 8,
1000 };
1001 self.advance(); // consume the x/u/U
1002 let mut codepoint: u32 = 0;
1003 for _ in 0..n {
1004 let c = self.current_char.ok_or_else(|| {
1005 Error::scan(
1006 self.position,
1007 format!("Truncated \\{escaped} escape"),
1008 )
1009 })?;
1010 let d = c.to_digit(16).ok_or_else(|| {
1011 Error::scan(
1012 self.position,
1013 format!("Invalid hex digit `{c}` in \\{escaped} escape"),
1014 )
1015 })?;
1016 codepoint = (codepoint << 4) | d;
1017 self.advance();
1018 }
1019 let ch = char::from_u32(codepoint).ok_or_else(|| {
1020 Error::scan(
1021 self.position,
1022 format!("Invalid Unicode codepoint U+{codepoint:X}"),
1023 )
1024 })?;
1025 value.push(ch);
1026 escape_end = value.len();
1027 continue; // already advanced past hex digits
1028 }
1029 // Everything else is invalid per spec.
1030 _ => {
1031 return Err(Error::scan(
1032 self.position,
1033 format!("Invalid escape sequence: \\{escaped}"),
1034 ));
1035 }
1036 }
1037 escape_end = value.len();
1038 self.advance();
1039 }
1040 } else if ch == '\\' {
1041 // Single-quoted strings have no backslash escapes — `\` is
1042 // a literal character. (Single-quote escape is `''`.)
1043 value.push(ch);
1044 self.advance();
1045 } else if ch == '\n' || ch == '\r' {
1046 // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
1047 // line folding: a single newline within a quoted scalar
1048 // folds to a space; N>1 consecutive newlines retain N-1;
1049 // leading whitespace on the continuation line is excluded.
1050 let mut newlines = 0usize;
1051 // §6.1: tabs cannot be indentation. A continuation
1052 // line that BEGINS with a tab (no leading spaces) in
1053 // an enclosing block context is invalid (yaml-test-
1054 // suite DK95/01). Tabs that appear AFTER spaces in
1055 // the same indent area are content, not indentation.
1056 let mut just_after_newline = false;
1057 while let Some(c) = self.current_char {
1058 match c {
1059 '\n' => {
1060 newlines += 1;
1061 multi_line = true;
1062 self.advance();
1063 just_after_newline = true;
1064 }
1065 '\r' => {
1066 self.advance();
1067 }
1068 ' ' => {
1069 self.advance();
1070 just_after_newline = false;
1071 }
1072 '\t' if just_after_newline
1073 && self.flow_level == 0
1074 && (self.indent_stack.len() > 1
1075 || !self.compact_sequence_indents.is_empty()) =>
1076 {
1077 return Err(Error::scan(
1078 self.position,
1079 "Tab cannot serve as indentation of quoted scalar continuation"
1080 .to_string(),
1081 ));
1082 }
1083 '\t' => {
1084 self.advance();
1085 }
1086 _ => break,
1087 }
1088 }
1089 // §8.1.4: a multi-line quoted scalar inside a block
1090 // context must indent each continuation more than the
1091 // enclosing block. \`quoted: "a\\nb"\` with \`b\` at col 1
1092 // violates the rule because \`quoted:\` sits at indent 0
1093 // (yaml-test-suite QB6E). Only fires when there IS an
1094 // enclosing block (indent_stack > [0] or compact-seq
1095 // active) — top-level quoted scalars with continuation
1096 // at col 1 are legal.
1097 if newlines > 0
1098 && self.flow_level == 0
1099 && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
1100 && !matches!(self.current_char, None | Some('\n' | '\r'))
1101 {
1102 let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
1103 let indent = self.position.column.saturating_sub(1);
1104 if indent <= parent_indent {
1105 return Err(Error::scan(
1106 self.position,
1107 "Quoted scalar continuation line is not indented enough".to_string(),
1108 ));
1109 }
1110 }
1111 // §6.8: a doc-start/end marker (`---` or `...`) at
1112 // column 1 always terminates the current document.
1113 // Encountering one inside an unterminated quoted
1114 // scalar is invalid — the quote escapes nothing past
1115 // the doc boundary (yaml-test-suite 5TRB, RXY3,
1116 // 9MQT/01).
1117 if self.position.column == 1 {
1118 let next3: String = self
1119 .char_cache
1120 .get(self.current_char_index..self.current_char_index + 3)
1121 .map(|s| s.iter().collect())
1122 .unwrap_or_default();
1123 if (next3 == "---" || next3 == "...")
1124 && self
1125 .char_cache
1126 .get(self.current_char_index + 3)
1127 .map_or(true, |c| c.is_whitespace())
1128 {
1129 return Err(Error::scan(
1130 self.position,
1131 format!(
1132 "Document {} marker `{}` inside quoted scalar",
1133 if next3 == "---" { "start" } else { "end" },
1134 next3
1135 ),
1136 ));
1137 }
1138 }
1139 // Drop trailing whitespace on the prior line (the bytes
1140 // we already pushed) before applying the fold. Don't
1141 // strip past `escape_end` — escape-produced whitespace
1142 // is literal content, not "trailing" line whitespace.
1143 while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
1144 value.pop();
1145 }
1146 if newlines <= 1 {
1147 value.push(' ');
1148 } else {
1149 for _ in 0..(newlines - 1) {
1150 value.push('\n');
1151 }
1152 }
1153 } else {
1154 value.push(ch);
1155 self.advance();
1156
1157 // Check string length periodically to fail fast
1158 if value.len() > self.limits.max_string_length {
1159 return Err(Error::limit_exceeded(format!(
1160 "String length {} exceeds maximum {}",
1161 value.len(),
1162 self.limits.max_string_length
1163 )));
1164 }
1165 }
1166 }
1167
1168 // Check string length limit
1169 if !closed {
1170 return Err(Error::scan(
1171 self.position,
1172 format!(
1173 "Unclosed {} quoted string",
1174 if quote_char == '"' {
1175 "double"
1176 } else {
1177 "single"
1178 }
1179 ),
1180 ));
1181 }
1182
1183 self.resource_tracker
1184 .check_string_length(&self.limits, value.len())?;
1185
1186 // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
1187 // the line (or sub-expression in flow context) must be empty save
1188 // for a separator. Skip horizontal whitespace and look at the next
1189 // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
1190 // newline/EOF, it's a trailing-content error (yaml-test-suite
1191 // Q4CL: `"quoted2" trailing content`).
1192 {
1193 let mut offset = 0isize;
1194 let mut saw_space = false;
1195 while matches!(self.peek_char(offset), Some(' ' | '\t')) {
1196 saw_space = true;
1197 offset += 1;
1198 }
1199 let next = self.peek_char(offset);
1200 // A `#` is a comment indicator ONLY when preceded by whitespace
1201 // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
1202 let ok = match next {
1203 None => true,
1204 Some('#') => saw_space,
1205 Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
1206 };
1207 if !ok {
1208 return Err(Error::scan(
1209 self.position,
1210 format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
1211 ));
1212 }
1213 // YAML 1.2 §8.1.3: implicit keys must be on a single line.
1214 // If the scalar folded across line breaks AND the next non-
1215 // whitespace char is `:` (key-value separator), the scalar
1216 // is being used as an implicit key — error.
1217 if multi_line && self.flow_level == 0 && next == Some(':') {
1218 return Err(Error::scan(
1219 self.position,
1220 "Multi-line quoted scalar may not be used as an implicit key".to_string(),
1221 ));
1222 }
1223 }
1224
1225 Ok(Token::new(
1226 TokenType::Scalar(value, quote_style),
1227 start_pos,
1228 self.position,
1229 ))
1230 }
1231
1232 /// Scan document start marker (---)
1233 fn scan_document_start(&mut self) -> Result<Option<Token>> {
1234 if self.current_char == Some('-')
1235 && self.peek_char(1) == Some('-')
1236 && self.peek_char(2) == Some('-')
1237 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1238 {
1239 // Doc markers are invalid inside flow collections.
1240 if self.flow_level > 0 {
1241 return Err(Error::scan(
1242 self.position,
1243 "`---` document-start marker is not allowed inside a flow collection"
1244 .to_string(),
1245 ));
1246 }
1247 let start_pos = self.position;
1248 self.advance(); // -
1249 self.advance(); // -
1250 self.advance(); // -
1251
1252 Ok(Some(Token::new(
1253 TokenType::DocumentStart,
1254 start_pos,
1255 self.position,
1256 )))
1257 } else {
1258 Ok(None)
1259 }
1260 }
1261
1262 /// Scan YAML version directive (%YAML)
1263 fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1264 if self.current_char != Some('%') {
1265 return Ok(None);
1266 }
1267
1268 let start_pos = self.position;
1269 let saved_position = self.position;
1270 self.advance(); // Skip '%'
1271
1272 // Check for "YAML"
1273 if self.current_char == Some('Y')
1274 && self.peek_char(1) == Some('A')
1275 && self.peek_char(2) == Some('M')
1276 && self.peek_char(3) == Some('L')
1277 && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1278 {
1279 self.advance(); // Y
1280 self.advance(); // A
1281 self.advance(); // M
1282 self.advance(); // L
1283
1284 // Skip whitespace
1285 self.skip_whitespace();
1286
1287 // Parse version number (e.g., "1.2")
1288 let major = if let Some(ch) = self.current_char {
1289 if ch.is_ascii_digit() {
1290 let digit = ch.to_digit(10).unwrap() as u8;
1291 self.advance();
1292 digit
1293 } else {
1294 return Err(Error::scan(
1295 self.position,
1296 "Expected major version number after %YAML".to_string(),
1297 ));
1298 }
1299 } else {
1300 return Err(Error::scan(
1301 self.position,
1302 "Expected version after %YAML directive".to_string(),
1303 ));
1304 };
1305
1306 // Expect '.'
1307 if self.current_char != Some('.') {
1308 return Err(Error::scan(
1309 self.position,
1310 "Expected '.' in YAML version".to_string(),
1311 ));
1312 }
1313 self.advance();
1314
1315 // Parse minor version
1316 let minor = if let Some(ch) = self.current_char {
1317 if ch.is_ascii_digit() {
1318 let digit = ch.to_digit(10).unwrap() as u8;
1319 self.advance();
1320 digit
1321 } else {
1322 return Err(Error::scan(
1323 self.position,
1324 "Expected minor version number after '.'".to_string(),
1325 ));
1326 }
1327 } else {
1328 return Err(Error::scan(
1329 self.position,
1330 "Expected minor version number".to_string(),
1331 ));
1332 };
1333
1334 // YAML 1.2 §6.8.1: the directive line must end after the
1335 // version (modulo whitespace and an optional comment). Extra
1336 // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1337 // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1338 // whitespace before `#`.
1339 let mut saw_space = false;
1340 while matches!(self.current_char, Some(' ' | '\t')) {
1341 saw_space = true;
1342 self.advance();
1343 }
1344 match self.current_char {
1345 None | Some('\n' | '\r') => {}
1346 Some('#') if saw_space => {
1347 while let Some(ch) = self.current_char {
1348 if ch == '\n' || ch == '\r' {
1349 break;
1350 }
1351 self.advance();
1352 }
1353 }
1354 Some(c) => {
1355 return Err(Error::scan(
1356 self.position,
1357 format!("Unexpected `{c}` after %YAML directive"),
1358 ));
1359 }
1360 }
1361
1362 Ok(Some(Token::new(
1363 TokenType::YamlDirective(major, minor),
1364 start_pos,
1365 self.position,
1366 )))
1367 } else {
1368 // Not a YAML directive, reset position
1369 self.position = saved_position;
1370 // Properly reset current_char based on saved position
1371 self.current_char = self
1372 .char_indices
1373 .iter()
1374 .find(|(i, _)| *i == saved_position.index)
1375 .map(|(_, ch)| *ch);
1376 // Reset the current_char_index
1377 self.current_char_index = self
1378 .char_indices
1379 .iter()
1380 .position(|(i, _)| *i == saved_position.index)
1381 .unwrap_or(0);
1382 Ok(None)
1383 }
1384 }
1385
1386 /// Scan TAG directive (%TAG)
1387 fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1388 if self.current_char != Some('%') {
1389 return Ok(None);
1390 }
1391
1392 let start_pos = self.position;
1393 let saved_position = self.position;
1394 self.advance(); // Skip '%'
1395
1396 // Check for "TAG"
1397 if self.current_char == Some('T')
1398 && self.peek_char(1) == Some('A')
1399 && self.peek_char(2) == Some('G')
1400 && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1401 {
1402 self.advance(); // T
1403 self.advance(); // A
1404 self.advance(); // G
1405
1406 // Skip whitespace
1407 self.skip_whitespace();
1408
1409 // Parse handle (e.g., "!" or "!!")
1410 let handle = self.scan_tag_handle()?;
1411
1412 // Skip whitespace
1413 self.skip_whitespace();
1414
1415 // Parse prefix (URI)
1416 let prefix = self.scan_tag_prefix()?;
1417
1418 Ok(Some(Token::new(
1419 TokenType::TagDirective(handle, prefix),
1420 start_pos,
1421 self.position,
1422 )))
1423 } else {
1424 // Reset position if not a TAG directive
1425 self.position = saved_position;
1426 // Properly reset current_char based on saved position
1427 self.current_char = self
1428 .char_indices
1429 .iter()
1430 .find(|(i, _)| *i == saved_position.index)
1431 .map(|(_, ch)| *ch);
1432 // Reset the current_char_index
1433 self.current_char_index = self
1434 .char_indices
1435 .iter()
1436 .position(|(i, _)| *i == saved_position.index)
1437 .unwrap_or(0);
1438 Ok(None)
1439 }
1440 }
1441
1442 /// Scan a tag handle for TAG directive
1443 fn scan_tag_handle(&mut self) -> Result<String> {
1444 let mut handle = String::new();
1445
1446 if self.current_char != Some('!') {
1447 return Err(Error::scan(
1448 self.position,
1449 "Expected '!' at start of tag handle".to_string(),
1450 ));
1451 }
1452
1453 handle.push('!');
1454 self.advance();
1455
1456 // Handle can be "!" or "!!" or "!name!"
1457 if self.current_char == Some('!') {
1458 // Secondary handle "!!"
1459 handle.push('!');
1460 self.advance();
1461 } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1462 // Named handle like "!name!"
1463 while let Some(ch) = self.current_char {
1464 if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1465 handle.push(ch);
1466 self.advance();
1467 } else if ch == '!' {
1468 handle.push(ch);
1469 self.advance();
1470 break;
1471 } else {
1472 break;
1473 }
1474 }
1475 }
1476 // else just "!" primary handle
1477
1478 Ok(handle)
1479 }
1480
1481 /// Scan a tag prefix (URI) for TAG directive
1482 fn scan_tag_prefix(&mut self) -> Result<String> {
1483 let mut prefix = String::new();
1484
1485 // Read until end of line or comment
1486 while let Some(ch) = self.current_char {
1487 if ch == '\n' || ch == '\r' || ch == '#' {
1488 break;
1489 }
1490 if ch.is_whitespace() && prefix.is_empty() {
1491 self.advance();
1492 continue;
1493 }
1494 if ch.is_whitespace() && !prefix.is_empty() {
1495 // Trailing whitespace, we're done
1496 break;
1497 }
1498 prefix.push(ch);
1499 self.advance();
1500 }
1501
1502 if prefix.is_empty() {
1503 return Err(Error::scan(
1504 self.position,
1505 "Expected tag prefix after tag handle".to_string(),
1506 ));
1507 }
1508
1509 Ok(prefix.trim().to_string())
1510 }
1511
1512 /// Check if current position might be a directive
1513 fn is_directive(&self) -> bool {
1514 self.current_char == Some('%') && self.position.column == 1
1515 }
1516
1517 /// Scan document end marker (...)
1518 fn scan_document_end(&mut self) -> Result<Option<Token>> {
1519 if self.current_char == Some('.')
1520 && self.peek_char(1) == Some('.')
1521 && self.peek_char(2) == Some('.')
1522 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1523 {
1524 // Doc markers are invalid inside flow collections.
1525 if self.flow_level > 0 {
1526 return Err(Error::scan(
1527 self.position,
1528 "`...` document-end marker is not allowed inside a flow collection".to_string(),
1529 ));
1530 }
1531 let start_pos = self.position;
1532 self.advance(); // .
1533 self.advance(); // .
1534 self.advance(); // .
1535
1536 // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1537 // end-of-line (comments allowed). Inline content after `...`
1538 // is invalid (yaml-test-suite 3HFZ).
1539 while let Some(ch) = self.current_char {
1540 match ch {
1541 ' ' | '\t' => {
1542 self.advance();
1543 }
1544 '\n' | '\r' | '#' => break,
1545 _ => {
1546 return Err(Error::scan(
1547 self.position,
1548 "Content after `...` document-end marker is invalid".to_string(),
1549 ));
1550 }
1551 }
1552 }
1553
1554 Ok(Some(Token::new(
1555 TokenType::DocumentEnd,
1556 start_pos,
1557 self.position,
1558 )))
1559 } else {
1560 Ok(None)
1561 }
1562 }
1563
1564 /// Scan a comment token
1565 fn scan_comment(&mut self) -> Result<Token> {
1566 let start_pos = self.position;
1567 let mut comment_text = String::new();
1568
1569 // Skip the '#' character
1570 if self.current_char == Some('#') {
1571 self.advance();
1572 }
1573
1574 // Collect the comment text
1575 while let Some(ch) = self.current_char {
1576 if ch == '\n' || ch == '\r' {
1577 break;
1578 }
1579 comment_text.push(ch);
1580 self.advance();
1581 }
1582
1583 // Trim leading whitespace from comment text
1584 let comment_text = comment_text.trim_start().to_string();
1585
1586 Ok(Token::new(
1587 TokenType::Comment(comment_text),
1588 start_pos,
1589 self.position,
1590 ))
1591 }
1592
1593 /// Process a line and generate appropriate tokens
1594 #[allow(clippy::cognitive_complexity)]
1595 fn process_line(&mut self) -> Result<()> {
1596 // Check for directives at start of line
1597 if self.position.column == 1 && self.current_char == Some('%') {
1598 // Try to scan YAML directive
1599 if let Some(token) = self.scan_yaml_directive()? {
1600 self.tokens.push(token);
1601 return Ok(());
1602 }
1603
1604 // Try to scan TAG directive
1605 if let Some(token) = self.scan_tag_directive()? {
1606 self.tokens.push(token);
1607 return Ok(());
1608 }
1609
1610 // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1611 // does not recognize. Skip the line silently — parsing continues
1612 // with whatever follows on the next line.
1613 if self.current_char == Some('%') {
1614 while let Some(ch) = self.current_char {
1615 if ch == '\n' || ch == '\r' {
1616 break;
1617 }
1618 self.advance();
1619 }
1620 return Ok(());
1621 }
1622 }
1623
1624 // Check for document markers at start of line
1625 if self.position.column == 1 {
1626 // Check for document start marker
1627 if let Some(token) = self.scan_document_start()? {
1628 self.tokens.push(token);
1629 return Ok(());
1630 }
1631
1632 // Check for document end marker
1633 if let Some(token) = self.scan_document_end()? {
1634 self.tokens.push(token);
1635 return Ok(());
1636 }
1637 }
1638
1639 // Handle indentation at start of line
1640 if self.position.column == 1 {
1641 self.handle_indentation()?;
1642 }
1643
1644 // Skip empty lines and comments
1645 self.skip_whitespace();
1646
1647 match self.current_char {
1648 None => return Ok(()),
1649 Some('#') => {
1650 if self.preserve_comments {
1651 // Create a comment token
1652 let comment_token = self.scan_comment()?;
1653 self.tokens.push(comment_token);
1654 } else {
1655 // Skip comment lines
1656 while let Some(ch) = self.current_char {
1657 if ch == '\n' || ch == '\r' {
1658 break;
1659 }
1660 self.advance();
1661 }
1662 }
1663 return Ok(());
1664 }
1665 Some('\n' | '\r') => {
1666 self.advance();
1667 return Ok(());
1668 }
1669 _ => {}
1670 }
1671
1672 // Process tokens on this line
1673 while let Some(ch) = self.current_char {
1674 match ch {
1675 '\n' | '\r' => break,
1676 ' ' | '\t' => {
1677 self.skip_whitespace();
1678 }
1679 '#' => {
1680 // YAML 1.2 §6.6: a comment must be preceded by whitespace
1681 // OR be at the start of a line. Inputs like `,#invalid`
1682 // (yaml-test-suite CVW2) are not valid comments.
1683 let prev = self.peek_char(-1);
1684 let at_line_start = self.position.column == 1;
1685 let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1686 if !at_line_start && !preceded_by_space {
1687 return Err(Error::scan(
1688 self.position,
1689 "Comment `#` must be preceded by whitespace".to_string(),
1690 ));
1691 }
1692 if self.preserve_comments {
1693 let comment_token = self.scan_comment()?;
1694 self.tokens.push(comment_token);
1695 } else {
1696 while let Some(ch) = self.current_char {
1697 if ch == '\n' || ch == '\r' {
1698 break;
1699 }
1700 self.advance();
1701 }
1702 }
1703 break;
1704 }
1705
1706 // Flow indicators. §7.4 allows a flow collection as
1707 // the implicit key of a block mapping (`[a]: b`,
1708 // `{x: y}: z`). When the flow-open is at line-start
1709 // (block context) and a `:` follows on the same line,
1710 // open the wrapping block mapping at the column of the
1711 // flow-open token, just as we do for line-start
1712 // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1713 '[' => {
1714 if self.flow_level == 0
1715 && self.position.column == self.current_indent + 1
1716 && self.check_for_mapping_ahead()
1717 {
1718 self.maybe_open_block_mapping_for_key()?;
1719 }
1720 let pos = self.position;
1721 self.advance();
1722 self.flow_level += 1;
1723 // Check depth limit
1724 self.resource_tracker
1725 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1726 self.tokens
1727 .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1728 }
1729 ']' => {
1730 // YAML 1.2 §7.4: `]` is only valid inside an open
1731 // flow sequence. Stray `]` is a syntax error
1732 // (yaml-test-suite 4H7K).
1733 if self.flow_level == 0 {
1734 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1735 .with_suggestion(
1736 "Remove the extra `]` or open a flow sequence with `[` first"
1737 .to_string(),
1738 );
1739 return Err(Error::scan_with_context(
1740 self.position,
1741 "Unexpected `]` outside flow context",
1742 context,
1743 ));
1744 }
1745 let pos = self.position;
1746 self.advance();
1747 self.flow_level -= 1;
1748 self.tokens
1749 .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1750 }
1751 '{' => {
1752 if self.flow_level == 0
1753 && self.position.column == self.current_indent + 1
1754 && self.check_for_mapping_ahead()
1755 {
1756 self.maybe_open_block_mapping_for_key()?;
1757 }
1758 let pos = self.position;
1759 self.advance();
1760 self.flow_level += 1;
1761 // Check depth limit
1762 self.resource_tracker
1763 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1764 self.tokens
1765 .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1766 }
1767 '}' => {
1768 if self.flow_level == 0 {
1769 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1770 .with_suggestion(
1771 "Remove the extra `}` or open a flow mapping with `{` first"
1772 .to_string(),
1773 );
1774 return Err(Error::scan_with_context(
1775 self.position,
1776 "Unexpected `}` outside flow context",
1777 context,
1778 ));
1779 }
1780 let pos = self.position;
1781 self.advance();
1782 self.flow_level -= 1;
1783 self.tokens
1784 .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1785 }
1786 ',' => {
1787 // §7.4: \`,\` is a flow indicator. Outside flow
1788 // context it's not meaningful as a structural
1789 // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1790 // — the comma after a tag in block context is
1791 // invalid).
1792 if self.flow_level == 0 {
1793 return Err(Error::scan(
1794 self.position,
1795 "Unexpected `,` outside flow context".to_string(),
1796 ));
1797 }
1798 let pos = self.position;
1799 self.advance();
1800 self.tokens
1801 .push(Token::new(TokenType::FlowEntry, pos, self.position));
1802 }
1803
1804 // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1805 // * Block context: `:` separates key from value only when
1806 // followed by whitespace / EOF — otherwise it's part of
1807 // a plain scalar (e.g. `:foo`, `URL://path`).
1808 // * Flow context: same, plus `:` may be adjacent to a
1809 // value when the previous token completed a key node
1810 // (quoted/plain scalar, alias, or closed flow
1811 // collection) — see yaml-test-suite 5MUD, 5T43.
1812 ':' if self.peek_char(1).map_or(true, |c| {
1813 c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1814 }) || (self.flow_level > 0
1815 && matches!(
1816 self.tokens.last().map(|t| &t.token_type),
1817 Some(
1818 TokenType::Scalar(_, _)
1819 | TokenType::Alias(_)
1820 | TokenType::FlowMappingEnd
1821 | TokenType::FlowSequenceEnd
1822 )
1823 )) =>
1824 {
1825 // §6.2: a \`:\` at line-start (the explicit-value
1826 // counterpart of an explicit \`?\` key) must be
1827 // followed by a SPACE — a tab as separator is
1828 // invalid (yaml-test-suite Y79Y/007, /009).
1829 if self.flow_level == 0
1830 && self.position.column == self.current_indent + 1
1831 && self.peek_char(1) == Some('\t')
1832 {
1833 return Err(Error::scan(
1834 self.position,
1835 "Tab cannot follow line-start `:` as explicit-value separator"
1836 .to_string(),
1837 ));
1838 }
1839 // §8.22: an implicit key in block context must fit
1840 // on a single line. If the previous token is a
1841 // flow-collection close whose matching open is on
1842 // a different line, the flow node spans multiple
1843 // lines and can't serve as the key (yaml-test-
1844 // suite C2SP \`[23\\n]: 42\`).
1845 if self.flow_level == 0 {
1846 let mut is_flow_close = false;
1847 let mut close_end_line = 0;
1848 if let Some(last) = self.tokens.last() {
1849 if matches!(
1850 last.token_type,
1851 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1852 ) {
1853 is_flow_close = true;
1854 close_end_line = last.end_position.line;
1855 }
1856 }
1857 if is_flow_close {
1858 let mut depth = 0i32;
1859 let mut open_idx: Option<usize> = None;
1860 for (idx, t) in self.tokens.iter().enumerate().rev() {
1861 match &t.token_type {
1862 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1863 depth += 1;
1864 }
1865 TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1866 depth -= 1;
1867 if depth == 0 {
1868 open_idx = Some(idx);
1869 break;
1870 }
1871 }
1872 _ => {}
1873 }
1874 }
1875 if let Some(oi) = open_idx {
1876 let open_line = self.tokens[oi].start_position.line;
1877 // If a `?` (Key) token precedes the
1878 // matching flow open on the same line
1879 // as the key, the key is explicit and
1880 // may span lines (yaml-test-suite M5DY
1881 // \`? [ ...spans... ]: [ ... ]\`).
1882 let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1883 matches!(t.token_type, TokenType::Key)
1884 && t.start_position.line == open_line
1885 });
1886 if !key_marker_before && open_line != close_end_line {
1887 return Err(Error::scan(
1888 self.position,
1889 "Implicit key in block context: flow collection key spans multiple lines"
1890 .to_string(),
1891 ));
1892 }
1893 }
1894 }
1895 }
1896 let pos = self.position;
1897 self.advance();
1898 self.tokens
1899 .push(Token::new(TokenType::Value, pos, self.position));
1900 }
1901
1902 // §6.2: the explicit-key marker \`?\` must be followed
1903 // by a SPACE (or EOL), not a tab. Tab as separator
1904 // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1905 '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1906 return Err(Error::scan(
1907 self.position,
1908 "Tab cannot follow `?` as block-key separator".to_string(),
1909 ));
1910 }
1911
1912 // Explicit key marker. An indented `?` at line-start
1913 // (e.g. `mapping:\\n ? key`) opens an implicit block
1914 // mapping at this column — same as a line-start scalar
1915 // key. Without this, scan_plain_scalar wouldn't see
1916 // the inner mapping's indent and would wrongly fold
1917 // the key content into a multi-line scalar
1918 // (yaml-test-suite S9E8, KK5P).
1919 '?' if self.flow_level == 0
1920 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1921 || self.peek_char(1).is_none()) =>
1922 {
1923 if self.position.column == self.current_indent + 1 {
1924 self.maybe_open_block_mapping_for_key()?;
1925 }
1926 let pos = self.position;
1927 self.advance();
1928 self.tokens
1929 .push(Token::new(TokenType::Key, pos, self.position));
1930 }
1931 '?' if self.flow_level > 0
1932 && (self
1933 .peek_char(1)
1934 .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1935 || self.peek_char(1).is_none()) =>
1936 {
1937 let pos = self.position;
1938 self.advance();
1939 self.tokens
1940 .push(Token::new(TokenType::Key, pos, self.position));
1941 }
1942
1943 // Block entry
1944 '-' if self.flow_level == 0
1945 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1946 || self.peek_char(1).is_none()) =>
1947 {
1948 // A block-entry \`-\` immediately after a flow
1949 // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1950 // is invalid — no separator between the closed
1951 // flow node and the next sibling (yaml-test-suite
1952 // P2EQ \`- { y: z }- invalid\`). The same-line guard
1953 // is essential — a \`}\` on a previous line with a
1954 // new \`-\` on the next line is perfectly valid.
1955 //
1956 // Likewise, a block-entry \`-\` immediately after a
1957 // property (Anchor / Tag) on the same line is
1958 // invalid — the property must precede a node, and
1959 // a block sequence's first \`-\` must begin a line
1960 // (yaml-test-suite SY6V \`&anchor - x\`).
1961 if let Some(last) = self.tokens.last() {
1962 if matches!(
1963 last.token_type,
1964 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1965 ) && last.end_position.line == self.position.line
1966 {
1967 return Err(Error::scan(
1968 self.position,
1969 "Block-entry `-` immediately after flow collection close"
1970 .to_string(),
1971 ));
1972 }
1973 if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1974 && last.end_position.line == self.position.line
1975 {
1976 return Err(Error::scan(
1977 self.position,
1978 "Block-entry `-` cannot follow a property on the same line"
1979 .to_string(),
1980 ));
1981 }
1982 // §8.22: a block sequence's first \`-\` must
1983 // begin on a new line. \`key: - a\` (implicit
1984 // key, then dash on same line) is invalid
1985 // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1986 // (explicit value-separator on the same line
1987 // as the dash) IS valid: the \`?\` key sits
1988 // on a previous line. We distinguish by
1989 // walking back from the Value: if the
1990 // preceding non-property token is a Scalar
1991 // on the same line as the Value, the key
1992 // is implicit; otherwise it's after \`?\`.
1993 if matches!(last.token_type, TokenType::Value)
1994 && last.end_position.line == self.position.line
1995 {
1996 let value_line = last.start_position.line;
1997 let mut prior_scalar_line = None;
1998 for t in self.tokens.iter().rev().skip(1) {
1999 match &t.token_type {
2000 TokenType::Anchor(_) | TokenType::Tag(_) => {}
2001 TokenType::Scalar(..) => {
2002 prior_scalar_line = Some(t.end_position.line);
2003 break;
2004 }
2005 _ => break,
2006 }
2007 }
2008 if prior_scalar_line == Some(value_line) {
2009 return Err(Error::scan(
2010 self.position,
2011 "Block sequence value cannot start on the same line as its key"
2012 .to_string(),
2013 ));
2014 }
2015 }
2016 }
2017 let pos = self.position;
2018 self.advance();
2019
2020 // Check if we need to start a new block sequence
2021 let last_indent = *self.indent_stack.last().unwrap();
2022
2023 // If a compact sequence (opened from `? - x` or
2024 // similar) is already active at this dash's column,
2025 // the dash continues it — don't open a new nested
2026 // block sequence (yaml-test-suite M5DY).
2027 let dash_indent = pos.column.saturating_sub(1);
2028 let compact_active_here = self
2029 .compact_sequence_indents
2030 .last()
2031 .map_or(false, |&si| si == dash_indent);
2032 if compact_active_here {
2033 // Continuation of an existing compact sequence.
2034 } else if self.current_indent > last_indent {
2035 // Deeper indentation - start new nested sequence
2036 self.indent_stack.push(self.current_indent);
2037 self.indent_is_sequence.push(true);
2038 // Check depth limit
2039 self.resource_tracker
2040 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2041 self.tokens
2042 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2043 } else if self.current_indent == last_indent
2044 && *self.indent_is_sequence.last().unwrap_or(&false)
2045 {
2046 // Same indent and the top of stack is already a sequence
2047 // → continuation of that sequence; no new start needed.
2048 } else if self.current_indent >= last_indent {
2049 // Same or root level — compact notation.
2050 // Start a new sequence only if we don't already have one
2051 // tracked at this exact indent.
2052 // For a dash that's *not* at line-start (e.g.
2053 // `? - x` where current_indent is still the
2054 // line's indent but the dash sits in mid-line),
2055 // use the dash column - 1 as the sequence's
2056 // indent so scan_plain_scalar's continuation
2057 // check correctly sees the deeper context
2058 // (yaml-test-suite M5DY).
2059 let dash_indent = pos.column.saturating_sub(1);
2060 let seq_indent = dash_indent.max(self.current_indent);
2061 let has_active_compact = self
2062 .compact_sequence_indents
2063 .last()
2064 .map_or(false, |&si| si == seq_indent);
2065
2066 if !has_active_compact {
2067 self.compact_sequence_indents.push(seq_indent);
2068 // Check depth limit
2069 self.resource_tracker.check_depth(
2070 &self.limits,
2071 self.flow_level + self.indent_stack.len(),
2072 )?;
2073 self.tokens
2074 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2075 }
2076 }
2077
2078 self.tokens
2079 .push(Token::new(TokenType::BlockEntry, pos, self.position));
2080
2081 // After emitting BlockEntry, check if the next
2082 // token is another dash (nested sequence). §6.2
2083 // requires SPACE separation between dashes — a
2084 // tab between the outer and inner \`-\` is invalid
2085 // (yaml-test-suite Y79Y/004, /005). Track whether
2086 // a tab was consumed while skipping the inter-
2087 // dash whitespace and reject if so.
2088 let mut saw_tab_between = false;
2089 while let Some(c) = self.current_char {
2090 if c == ' ' {
2091 self.advance();
2092 } else if c == '\t' {
2093 saw_tab_between = true;
2094 self.advance();
2095 } else {
2096 break;
2097 }
2098 }
2099 if self.current_char == Some('-')
2100 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2101 && saw_tab_between
2102 {
2103 return Err(Error::scan(
2104 self.position,
2105 "Tab between block-entries on same line".to_string(),
2106 ));
2107 }
2108 if self.current_char == Some('-')
2109 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2110 {
2111 // We have a nested sequence on the same line!
2112 // Track this as an inline sequence
2113 self.inline_sequence_depth += 1;
2114 // Push the *indent* (column - 1), not the
2115 // column, so it matches the convention used by
2116 // maybe_open_block_mapping_for_key. With column
2117 // here the next-line indent (column - 1) would
2118 // be strictly less than the stored value and
2119 // wrongly trigger an early close, breaking
2120 // multi-line nested sequences (yaml-test-suite
2121 // 3ALJ, 57H4).
2122 self.indent_stack
2123 .push(self.position.column.saturating_sub(1));
2124 self.indent_is_sequence.push(true);
2125 // Check depth limit
2126 self.resource_tracker
2127 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2128 self.tokens
2129 .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2130 // Continue processing - the next iteration will handle the nested dash
2131 } else if self.current_char.is_some()
2132 && !matches!(self.current_char, Some('\n' | '\r'))
2133 {
2134 // Content follows "- " on the same line.
2135 // Update current_indent to the content's column position so that
2136 // any mapping started here will be at a deeper indent level than
2137 // the sequence. This ensures handle_indentation properly closes
2138 // the mapping when the next sibling "- " appears.
2139 self.current_indent = self.position.column - 1;
2140 }
2141 }
2142
2143 // Quoted strings — same implicit-key mapping detection
2144 // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2145 '"' | '\'' => {
2146 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2147 self.maybe_open_block_mapping_for_key()?;
2148 }
2149 let token = self.scan_quoted_string(ch)?;
2150 self.tokens.push(token);
2151 }
2152
2153 // Document markers (only if not a block entry).
2154 //
2155 // Reached only when `-` is at column = current_indent + 1 AND
2156 // the next character is non-whitespace — i.e. either the
2157 // `---` document-start marker OR a plain scalar starting
2158 // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2159 // declines, we MUST consume the run as a plain scalar — not
2160 // consulting `is_plain_scalar_start` here, because that helper
2161 // unconditionally rejects `-`, which would leave the outer
2162 // `while let` loop spinning on the same character.
2163 '-' if self.position.column == self.current_indent + 1
2164 && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2165 {
2166 if let Some(token) = self.scan_document_start()? {
2167 self.tokens.push(token);
2168 } else {
2169 let token = self.scan_plain_scalar()?;
2170 self.tokens.push(token);
2171 }
2172 }
2173 '.' if self.position.column == self.current_indent + 1 => {
2174 if let Some(token) = self.scan_document_end()? {
2175 self.tokens.push(token);
2176 } else if self.is_plain_scalar_start() {
2177 let token = self.scan_plain_scalar()?;
2178 self.tokens.push(token);
2179 }
2180 }
2181
2182 // Numbers or plain scalars starting with -
2183 // Only scan as number if the entire token is numeric (no trailing letters)
2184 _ if (ch.is_ascii_digit()
2185 || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2186 && self.is_pure_number() =>
2187 {
2188 let token = self.scan_number()?;
2189 self.tokens.push(token);
2190 }
2191
2192 // Anchors and aliases. §6.9: a node's properties
2193 // (anchor/tag) are prefixes of the node. When an `&`,
2194 // `*`, or `!` is at the start of a line (column ==
2195 // current_indent + 1) and a `: ` follows on the same
2196 // line, the property/alias is part of an implicit
2197 // key's leading position. The block mapping that
2198 // contains this key therefore opens at this column,
2199 // *before* the property/alias token is emitted
2200 // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2201 '&' => {
2202 // Mirror H7J7 check for anchors (yaml-test-suite
2203 // G9HC \`seq:\\n&anchor\\n- a\`).
2204 if self.flow_level == 0
2205 && self.position.column == self.current_indent + 1
2206 && !self.check_for_mapping_ahead()
2207 && self.indent_stack.len() > 1
2208 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2209 && self.most_recent_token_is_value_separator()
2210 {
2211 return Err(Error::scan(
2212 self.position,
2213 "Anchor at line-start with insufficient indent for value position"
2214 .to_string(),
2215 ));
2216 }
2217 if self.flow_level == 0
2218 && self.position.column == self.current_indent + 1
2219 && self.check_for_mapping_ahead()
2220 {
2221 self.maybe_open_block_mapping_for_key()?;
2222 }
2223 let token = self.scan_anchor()?;
2224 self.tokens.push(token);
2225 }
2226 '*' => {
2227 // §6.9.2: alias/anchor names may contain \`:\` (only
2228 // flow indicators and whitespace terminate them).
2229 // So \`*a:\` is an alias named \`a:\`, NOT an alias
2230 // \`*a\` followed by a key separator. Don't open
2231 // an implicit block mapping in that case (yaml-
2232 // test-suite 2SXE).
2233 if self.flow_level == 0
2234 && self.position.column == self.current_indent + 1
2235 && self.check_for_mapping_ahead()
2236 && !self.colon_belongs_to_alias_anchor_name()
2237 {
2238 self.maybe_open_block_mapping_for_key()?;
2239 }
2240 let token = self.scan_alias()?;
2241 self.tokens.push(token);
2242 }
2243
2244 // Block scalars
2245 '|' => {
2246 let token = self.scan_literal_block_scalar()?;
2247 self.tokens.push(token);
2248 // Block scalar collection rewinds the cursor to the
2249 // start of the next under-indented line. `current_indent`
2250 // is still set to the inline content's column from the
2251 // enclosing `- |` / `key: |` site, so the next iteration
2252 // would mis-dispatch. Break out so the outer loop
2253 // re-enters `process_line` and reruns indent handling
2254 // (yaml-test-suite 4QFQ, M6YH, P2AD).
2255 break;
2256 }
2257 '>' => {
2258 let token = self.scan_folded_block_scalar()?;
2259 self.tokens.push(token);
2260 break;
2261 }
2262
2263 // Tags. Same line-start property-opens-mapping rule
2264 // (yaml-test-suite ZH7C variants).
2265 //
2266 // §6.9: a property at the SAME indent as the
2267 // enclosing mapping/sequence cannot apply to that
2268 // collection's value — the value must be more
2269 // indented. If we're at a line-start \`!\` whose column
2270 // equals the enclosing mapping's indent + 1 AND that
2271 // mapping currently has a key awaiting a value, the
2272 // tag is misplaced (yaml-test-suite H7J7).
2273 '!' => {
2274 if self.flow_level == 0
2275 && self.position.column == self.current_indent + 1
2276 && !self.check_for_mapping_ahead()
2277 && self.indent_stack.len() > 1
2278 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2279 && self.most_recent_token_is_value_separator()
2280 {
2281 return Err(Error::scan(
2282 self.position,
2283 "Tag at line-start with insufficient indent for value position"
2284 .to_string(),
2285 ));
2286 }
2287 if self.flow_level == 0
2288 && self.position.column == self.current_indent + 1
2289 && self.check_for_mapping_ahead()
2290 {
2291 self.maybe_open_block_mapping_for_key()?;
2292 }
2293 let token = self.scan_tag()?;
2294 self.tokens.push(token);
2295 }
2296
2297 // Plain scalars
2298 _ if self.is_plain_scalar_start() => {
2299 // A plain scalar starting on the SAME line as a
2300 // flow-collection close (\`}\` or \`]\`) means there's
2301 // no separator between the closed flow node and
2302 // the new content (yaml-test-suite 62EZ
2303 // \`x: { y: z }in: valid\`).
2304 if self.flow_level == 0 {
2305 if let Some(last) = self.tokens.last() {
2306 if matches!(
2307 last.token_type,
2308 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2309 ) && last.end_position.line == self.position.line
2310 {
2311 return Err(Error::scan(
2312 self.position,
2313 "Plain scalar immediately after flow collection close"
2314 .to_string(),
2315 ));
2316 }
2317 }
2318 }
2319 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2320 self.maybe_open_block_mapping_for_key()?;
2321 }
2322
2323 let token = self.scan_plain_scalar()?;
2324 self.tokens.push(token);
2325 }
2326
2327 _ => {
2328 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2329 .with_suggestion("Check for valid YAML syntax characters".to_string());
2330 return Err(Error::invalid_character_with_context(
2331 self.position,
2332 ch,
2333 "YAML document",
2334 context,
2335 ));
2336 }
2337 }
2338 }
2339
2340 // Inline sequences (nested \`- -\` on one line) used to be
2341 // closed unconditionally at end-of-line. But a nested sequence
2342 // can span lines (`- - a\n - b\n- c`) — in that case the inner
2343 // sequence must remain open until handle_indentation sees a
2344 // dedent. Reset the inline-sequence counter (so the next line
2345 // is judged on its own merits) but DO NOT emit BlockEnd —
2346 // handle_indentation's indent_stack pop, the end-of-stream
2347 // close at scan_next_token, and the explicit-dedent close at
2348 // handle_indentation's bottom each provide a correct close.
2349 self.inline_sequence_depth = 0;
2350
2351 Ok(())
2352 }
2353
2354 /// Scan the next token lazily
2355 fn scan_next_token(&mut self) -> Result<()> {
2356 if self.done {
2357 return Ok(());
2358 }
2359
2360 // Add stream start token if this is the beginning
2361 if self.tokens.is_empty() {
2362 self.tokens
2363 .push(Token::simple(TokenType::StreamStart, self.position));
2364 return Ok(());
2365 }
2366
2367 // Check if we're at the end of input
2368 if self.current_char.is_none() {
2369 if !self
2370 .tokens
2371 .iter()
2372 .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2373 {
2374 self.tokens
2375 .push(Token::simple(TokenType::StreamEnd, self.position));
2376 }
2377 self.done = true;
2378 return Ok(());
2379 }
2380
2381 // For now, fall back to scanning all tokens at once for the lazy scanner
2382 // This is a simplified implementation - a full streaming parser would
2383 // need more sophisticated state management
2384 let tokens_before = self.tokens.len();
2385 self.scan_all_tokens()?;
2386
2387 // Mark as done after scanning all tokens
2388 if self.tokens.len() == tokens_before {
2389 self.done = true;
2390 }
2391
2392 Ok(())
2393 }
2394
2395 /// Pre-scan all tokens (simplified approach for basic implementation)
2396 fn scan_all_tokens(&mut self) -> Result<()> {
2397 // Only add StreamStart if we don't have it yet
2398 if !self
2399 .tokens
2400 .iter()
2401 .any(|t| matches!(t.token_type, TokenType::StreamStart))
2402 {
2403 self.tokens
2404 .push(Token::simple(TokenType::StreamStart, self.position));
2405 }
2406
2407 while self.current_char.is_some() {
2408 self.process_line()?;
2409
2410 // Advance past newlines
2411 while let Some(ch) = self.current_char {
2412 if ch == '\n' || ch == '\r' {
2413 self.advance();
2414 } else {
2415 break;
2416 }
2417 }
2418 }
2419
2420 // Close any remaining compact sequences (before their parent mappings)
2421 while self.compact_sequence_indents.pop().is_some() {
2422 self.tokens
2423 .push(Token::simple(TokenType::BlockEnd, self.position));
2424 }
2425
2426 // Close any remaining blocks
2427 while self.indent_stack.len() > 1 {
2428 self.indent_stack.pop();
2429 self.indent_is_sequence.pop();
2430 self.tokens
2431 .push(Token::simple(TokenType::BlockEnd, self.position));
2432 }
2433
2434 self.tokens
2435 .push(Token::simple(TokenType::StreamEnd, self.position));
2436 self.done = true;
2437 Ok(())
2438 }
2439
2440 /// Peek at a character at the given offset (can be negative)
2441 /// Check if the current position starts a pure number (digits/dots/minus only,
2442 /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2443 fn is_pure_number(&self) -> bool {
2444 let mut offset: isize = 0;
2445 let first = self.peek_char(0);
2446 // Skip leading minus
2447 if first == Some('-') {
2448 offset = 1;
2449 }
2450 // Scan digits and at most one dot
2451 let mut has_digit = false;
2452 let mut dot_count = 0;
2453 loop {
2454 match self.peek_char(offset) {
2455 Some(c) if c.is_ascii_digit() => {
2456 has_digit = true;
2457 offset += 1;
2458 }
2459 Some('.') => {
2460 dot_count += 1;
2461 if dot_count > 1 {
2462 // Multiple dots (e.g. 0.5.8) — not a number
2463 return false;
2464 }
2465 offset += 1;
2466 }
2467 Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2468 // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2469 return false;
2470 }
2471 Some(c) => {
2472 // For a token to be a pure number, what follows
2473 // the digits must be end-of-token. In flow
2474 // context that's a flow indicator. In block
2475 // context the rest of the line must be pure
2476 // whitespace (possibly trailing a comment) — if
2477 // there's more non-whitespace content on this
2478 // line, the digits are part of a larger plain
2479 // scalar like \`1 - 3\` (yaml-test-suite P76L)
2480 // or \`20:03:20\` (yaml-test-suite U9NS).
2481 if self.flow_level > 0 && ",[]{}".contains(c) {
2482 return has_digit;
2483 }
2484 if c == '\n' || c == '\r' {
2485 return has_digit;
2486 }
2487 if c == ' ' || c == '\t' {
2488 // Look ahead: rest of line must be whitespace
2489 // or a comment.
2490 let mut probe = offset + 1;
2491 loop {
2492 match self.peek_char(probe) {
2493 None => return has_digit,
2494 Some('\n' | '\r') => return has_digit,
2495 Some('#') => return has_digit,
2496 Some(' ' | '\t') => probe += 1,
2497 Some(_) => return false,
2498 }
2499 }
2500 }
2501 if c == ':' {
2502 let next = self.peek_char(offset + 1);
2503 return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2504 }
2505 return false;
2506 }
2507 None => return has_digit,
2508 }
2509 }
2510 }
2511
2512 fn peek_char(&self, offset: isize) -> Option<char> {
2513 if offset >= 0 {
2514 let target_index = self.current_char_index + offset as usize;
2515 if target_index < self.char_cache.len() {
2516 Some(self.char_cache[target_index])
2517 } else {
2518 None
2519 }
2520 } else {
2521 let offset_magnitude = (-offset) as usize;
2522 if self.current_char_index >= offset_magnitude {
2523 Some(self.char_cache[self.current_char_index - offset_magnitude])
2524 } else {
2525 None
2526 }
2527 }
2528 }
2529
2530 /// Scan an anchor token (&name)
2531 fn scan_anchor(&mut self) -> Result<Token> {
2532 let start_pos = self.position;
2533 self.advance(); // Skip '&'
2534
2535 let name = self.scan_identifier()?;
2536 if name.is_empty() {
2537 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2538 "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2539 );
2540 return Err(Error::scan_with_context(
2541 self.position,
2542 "Anchor name cannot be empty",
2543 context,
2544 ));
2545 }
2546
2547 // Track anchor for resource limits
2548 self.resource_tracker.add_anchor(&self.limits)?;
2549
2550 Ok(Token::new(
2551 TokenType::Anchor(name),
2552 start_pos,
2553 self.position,
2554 ))
2555 }
2556
2557 /// Scan an alias token (*name)
2558 fn scan_alias(&mut self) -> Result<Token> {
2559 let start_pos = self.position;
2560 self.advance(); // Skip '*'
2561
2562 let name = self.scan_identifier()?;
2563 if name.is_empty() {
2564 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2565 "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2566 );
2567 return Err(Error::scan_with_context(
2568 self.position,
2569 "Alias name cannot be empty",
2570 context,
2571 ));
2572 }
2573
2574 Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2575 }
2576
2577 /// Scan an identifier (used for anchor and alias names)
2578 fn scan_identifier(&mut self) -> Result<String> {
2579 // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2580 // exclusions are whitespace and the flow indicators `,[]{}`. This
2581 // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2582 // codepoints (including emoji), matching the spec exactly.
2583 let mut identifier = String::new();
2584 while let Some(ch) = self.current_char {
2585 if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2586 break;
2587 }
2588 identifier.push(ch);
2589 self.advance();
2590 }
2591 Ok(identifier)
2592 }
2593
2594 /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2595 fn scan_tag(&mut self) -> Result<Token> {
2596 let start_pos = self.position;
2597 self.advance(); // Skip first '!'
2598
2599 let mut tag = String::from("!");
2600
2601 // Check for verbatim tag format: !<tag>
2602 if self.current_char == Some('<') {
2603 tag.push('<');
2604 self.advance(); // Skip '<'
2605
2606 // Scan until closing '>'
2607 while let Some(ch) = self.current_char {
2608 if ch == '>' {
2609 tag.push(ch);
2610 self.advance();
2611 break;
2612 } else if ch.is_control() || ch.is_whitespace() {
2613 return Err(Error::scan(
2614 self.position,
2615 "Invalid character in verbatim tag".to_string(),
2616 ));
2617 }
2618 tag.push(ch);
2619 self.advance();
2620 }
2621 } else {
2622 // Check for secondary tag handle: !!
2623 if self.current_char == Some('!') {
2624 tag.push('!');
2625 self.advance(); // Skip second '!'
2626 }
2627
2628 // Scan tag name/suffix.
2629 //
2630 // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2631 // contain any URI character (RFC 3986 unreserved + sub-delims +
2632 // a few others) or `%XX` percent-encoded bytes. The handful of
2633 // characters listed below covers the alphanumeric + URI-safe
2634 // punctuation set used by yaml-test-suite. Percent decoding of
2635 // `%XX` happens later in `TagResolver::resolve`.
2636 //
2637 // §5.3: inside a flow collection, the flow indicators
2638 // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2639 // must NOT consume them into the tag suffix even though
2640 // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2641 // YAML 1.2 in practice treats `,` as a flow indicator that
2642 // must be percent-encoded (\`%2C\`) when it appears inside
2643 // a tag suffix — bare \`,\` is not allowed in EITHER block
2644 // or flow context (yaml-test-suite U99R).
2645 while let Some(ch) = self.current_char {
2646 if matches!(ch, ',') {
2647 break;
2648 }
2649 if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2650 break;
2651 }
2652 // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2653 // `tag:yaml.org,2002:str` legitimately contains two
2654 // colons inside its URI. But a `:` followed by
2655 // whitespace, EOL or EOF is the YAML mapping-value
2656 // indicator and MUST terminate the tag, otherwise
2657 // `!handle!suffix: value` is mis-scanned as
2658 // `Tag("!handle!suffix:") Scalar("value")` and the
2659 // implicit-key mapping structure is lost. Mirrors the
2660 // `,` carve-out above (a valid URI char that's also a
2661 // YAML flow indicator in some contexts).
2662 if ch == ':' {
2663 match self.peek_char(1) {
2664 None => break,
2665 Some(c) if c.is_whitespace() => break,
2666 _ => {}
2667 }
2668 }
2669 if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2670 tag.push(ch);
2671 self.advance();
2672 } else {
2673 break;
2674 }
2675 }
2676 }
2677
2678 Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2679 }
2680
2681 /// Scan a literal block scalar (|)
2682 fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2683 let start_pos = self.position;
2684 self.advance(); // Skip '|'
2685
2686 // Parse block scalar header (indicators like +, -, explicit indent)
2687 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2688
2689 // Skip to next line
2690 self.skip_to_next_line()?;
2691
2692 // Determine indentation. `base_indent` is the surrounding
2693 // block's indent — i.e. the indent of the sequence or
2694 // mapping that contains this scalar. `self.current_indent`
2695 // is sometimes set to the inline indicator column (e.g. 2
2696 // for `- |`), which would make `base_indent + explicit`
2697 // wrong; use the top of `indent_stack` instead
2698 // (yaml-test-suite 4QFQ `|1`).
2699 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2700 let content_indent = if let Some(explicit) = explicit_indent {
2701 base_indent + explicit
2702 } else {
2703 // Find the first non-empty content line to determine indentation
2704 self.find_block_scalar_indent(base_indent)?
2705 };
2706
2707 // Collect the literal block content
2708 let content = self.collect_literal_block_content(content_indent, chomping)?;
2709
2710 Ok(Token::new(
2711 TokenType::BlockScalarLiteral(content),
2712 start_pos,
2713 self.position,
2714 ))
2715 }
2716
2717 /// Scan a folded block scalar (>)
2718 fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2719 let start_pos = self.position;
2720 self.advance(); // Skip '>'
2721
2722 // Parse block scalar header (indicators like +, -, explicit indent)
2723 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2724
2725 // Skip to next line
2726 self.skip_to_next_line()?;
2727
2728 // See scan_literal_block_scalar for why we read `indent_stack`
2729 // rather than `current_indent`.
2730 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2731 let content_indent = if let Some(explicit) = explicit_indent {
2732 base_indent + explicit
2733 } else {
2734 // Find the first non-empty content line to determine indentation
2735 self.find_block_scalar_indent(base_indent)?
2736 };
2737
2738 // Collect the folded block content
2739 let content = self.collect_folded_block_content(content_indent, chomping)?;
2740
2741 Ok(Token::new(
2742 TokenType::BlockScalarFolded(content),
2743 start_pos,
2744 self.position,
2745 ))
2746 }
2747
2748 /// Parse block scalar header indicators (+, -, and explicit indent)
2749 fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2750 let mut chomping = ChompingMode::Clip;
2751 let mut explicit_indent: Option<usize> = None;
2752 // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2753 // \`>#x\` are invalid (yaml-test-suite X4QW).
2754 let mut seen_separator_ws = false;
2755
2756 // Parse indicators in any order
2757 while let Some(ch) = self.current_char {
2758 match ch {
2759 '+' => {
2760 chomping = ChompingMode::Keep;
2761 self.advance();
2762 }
2763 '-' => {
2764 chomping = ChompingMode::Strip;
2765 self.advance();
2766 }
2767 '0'..='9' => {
2768 let digit = ch.to_digit(10).unwrap() as usize;
2769 if explicit_indent.is_some() {
2770 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2771 .with_suggestion(
2772 "Use only one indent indicator digit in block scalar".to_string(),
2773 );
2774 return Err(Error::scan_with_context(
2775 self.position,
2776 "Multiple indent indicators in block scalar",
2777 context,
2778 ));
2779 }
2780 // YAML 1.2 §8.1.1.1: explicit indent indicator is
2781 // 1..=9. `|0` and `>0` are invalid
2782 // (yaml-test-suite 2G84/00).
2783 if digit == 0 {
2784 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2785 .with_suggestion(
2786 "Block-scalar indent indicator must be 1-9".to_string(),
2787 );
2788 return Err(Error::scan_with_context(
2789 self.position,
2790 "Block-scalar indent indicator `0` is invalid",
2791 context,
2792 ));
2793 }
2794 explicit_indent = Some(digit);
2795 self.advance();
2796 }
2797 ' ' | '\t' => {
2798 seen_separator_ws = true;
2799 self.advance(); // Skip whitespace
2800 }
2801 '#' => {
2802 if !seen_separator_ws {
2803 return Err(Error::scan(
2804 self.position,
2805 "Comment in block-scalar header must be preceded by whitespace"
2806 .to_string(),
2807 ));
2808 }
2809 // Skip comment to end of line
2810 while let Some(ch) = self.current_char {
2811 self.advance();
2812 if ch == '\n' || ch == '\r' {
2813 break;
2814 }
2815 }
2816 break;
2817 }
2818 '\n' | '\r' => break,
2819 _ => {
2820 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2821 .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2822 return Err(Error::invalid_character_with_context(
2823 self.position,
2824 ch,
2825 "block scalar header",
2826 context,
2827 ));
2828 }
2829 }
2830 }
2831
2832 Ok((chomping, explicit_indent))
2833 }
2834
2835 /// Advance the cursor PAST the next line break, but do not consume
2836 /// any leading whitespace on the line that follows. The block-
2837 /// scalar header parser uses this to step from the indicator line
2838 /// to the start of the content line — the next line's leading
2839 /// spaces are part of its content_indent, not header whitespace.
2840 fn skip_to_next_line(&mut self) -> Result<()> {
2841 // If we're already at column 1 (the comment handler in
2842 // scan_block_scalar_header may have already advanced past a
2843 // newline), do nothing — the next line's leading whitespace
2844 // belongs to its content_indent.
2845 if self.position.column == 1 {
2846 return Ok(());
2847 }
2848 while let Some(ch) = self.current_char {
2849 match ch {
2850 '\n' | '\r' => {
2851 self.advance();
2852 return Ok(());
2853 }
2854 ' ' | '\t' => {
2855 self.advance();
2856 }
2857 _ => return Ok(()),
2858 }
2859 }
2860 Ok(())
2861 }
2862
2863 /// Find the content indentation for a block scalar.
2864 ///
2865 /// Per spec §8.1.1.1, indent is the leading-space count of the first
2866 /// non-empty content line (or the longest blank-line indent if no
2867 /// non-empty line exists). A non-empty line whose indent is not
2868 /// strictly deeper than `base_indent` is outside the scalar's
2869 /// scope — that line is a sibling structure, not content
2870 /// (yaml-test-suite K858).
2871 fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
2872 let saved_position = self.position;
2873 let saved_char = self.current_char;
2874 let saved_char_index = self.current_char_index;
2875
2876 let mut max_blank_indent: usize = 0;
2877 let mut found = false;
2878 let mut content_indent: usize = 1;
2879
2880 loop {
2881 let mut line_indent = 0;
2882 while self.current_char == Some(' ') {
2883 line_indent += 1;
2884 self.advance();
2885 }
2886 // §6.1 + §8.1: tabs cannot serve as block-scalar
2887 // indentation. A line that BEGINS with a tab (no leading
2888 // spaces) inside the block scalar's indent search is
2889 // invalid (yaml-test-suite Y79Y/000 \`foo: |\\n\\tbar\`).
2890 // Tabs that appear AFTER one or more spaces are content,
2891 // not indentation, and remain valid (yaml-test-suite
2892 // 96NN/00 \`foo: |-\\n \\tbar\`).
2893 if line_indent == 0 && self.current_char == Some('\t') {
2894 return Err(Error::scan(
2895 self.position,
2896 "Tab cannot serve as block-scalar indentation".to_string(),
2897 ));
2898 }
2899
2900 match self.current_char {
2901 None => {
2902 if line_indent > max_blank_indent {
2903 max_blank_indent = line_indent;
2904 }
2905 break;
2906 }
2907 Some('\n' | '\r') => {
2908 if line_indent > max_blank_indent {
2909 max_blank_indent = line_indent;
2910 }
2911 self.advance();
2912 // fall through to next iteration
2913 }
2914 Some(_) => {
2915 // If we're nested inside another block — either
2916 // via the `indent_stack` (normal mapping/sequence
2917 // open) or `compact_sequence_indents` (a
2918 // compact block sequence at the same indent as
2919 // its parent) — and this candidate line is not
2920 // strictly deeper than base_indent, it's a
2921 // sibling outside the scalar's scope (yaml-test-
2922 // suite K858, P2AD).
2923 let inside_block =
2924 self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
2925 if inside_block && line_indent <= base_indent {
2926 content_indent = max_blank_indent.max(base_indent + 1);
2927 } else {
2928 content_indent = line_indent;
2929 }
2930 // §8.1.2.1: leading blank lines may not exceed the
2931 // detected content indent — that ambiguity is
2932 // invalid (yaml-test-suite W9L4, S98Z).
2933 if max_blank_indent > content_indent {
2934 self.position = saved_position;
2935 self.current_char = saved_char;
2936 self.current_char_index = saved_char_index;
2937 return Err(Error::scan(
2938 self.position,
2939 "Block scalar leading blank-line indent exceeds content indent"
2940 .to_string(),
2941 ));
2942 }
2943 found = true;
2944 break;
2945 }
2946 }
2947 }
2948
2949 if !found {
2950 content_indent = max_blank_indent;
2951 }
2952
2953 self.position = saved_position;
2954 self.current_char = saved_char;
2955 self.current_char_index = saved_char_index;
2956
2957 Ok(content_indent)
2958 }
2959
2960 /// Count indentation at start of current line
2961 fn count_line_indent(&mut self) -> usize {
2962 let mut indent = 0;
2963 let saved_position = self.position;
2964 let saved_char = self.current_char;
2965 let saved_char_index = self.current_char_index;
2966
2967 while let Some(ch) = self.current_char {
2968 if ch == ' ' {
2969 indent += 1;
2970 self.advance();
2971 } else if ch == '\t' {
2972 indent += 8; // Tab counts as 8 spaces
2973 self.advance();
2974 } else {
2975 break;
2976 }
2977 }
2978
2979 // Restore position
2980 self.position = saved_position;
2981 self.current_char = saved_char;
2982 self.current_char_index = saved_char_index;
2983
2984 indent
2985 }
2986
2987 /// Collect content for a literal block scalar.
2988 ///
2989 /// Each line is preserved with its terminating newline. After collection
2990 /// we apply the chomping mode per spec §8.1.1.2.
2991 fn collect_literal_block_content(
2992 &mut self,
2993 content_indent: usize,
2994 chomping: ChompingMode,
2995 ) -> Result<String> {
2996 let mut content = String::new();
2997
2998 loop {
2999 // Count current line's leading-space indent.
3000 let mut line_indent = 0;
3001 let save_pos = self.position;
3002 let save_ch = self.current_char;
3003 let save_idx = self.current_char_index;
3004 while self.current_char == Some(' ') {
3005 line_indent += 1;
3006 self.advance();
3007 }
3008
3009 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3010
3011 if !line_is_blank && line_indent < content_indent {
3012 // Non-empty line with less indent ends the scalar; rewind.
3013 self.position = save_pos;
3014 self.current_char = save_ch;
3015 self.current_char_index = save_idx;
3016 break;
3017 }
3018
3019 // Document marker at line start always ends the scalar,
3020 // regardless of content_indent (allows zero-indented
3021 // block scalars per yaml-test-suite FP8R).
3022 if line_indent == 0 && self.is_doc_marker_here() {
3023 self.position = save_pos;
3024 self.current_char = save_ch;
3025 self.current_char_index = save_idx;
3026 break;
3027 }
3028
3029 if line_is_blank {
3030 // A blank line counts when there's an actual line break
3031 // to consume. EOF after we've consumed some whitespace
3032 // on the trailing line ALSO counts as one final blank
3033 // line (yaml-test-suite JEF9/02: `- |+\n `).
3034 if matches!(self.current_char, Some('\n' | '\r')) {
3035 // Whitespace beyond content_indent is literal content
3036 // even on blank lines (yaml-test-suite 6FWR).
3037 for _ in content_indent..line_indent {
3038 content.push(' ');
3039 }
3040 content.push('\n');
3041 self.advance();
3042 continue;
3043 }
3044 if line_indent > 0 {
3045 for _ in content_indent..line_indent {
3046 content.push(' ');
3047 }
3048 content.push('\n');
3049 }
3050 break;
3051 }
3052
3053 // Content line: we already consumed `line_indent` spaces, but
3054 // only `content_indent` of them belong to indentation. Any
3055 // extra leading spaces are literal content.
3056 let mut line = String::new();
3057 for _ in content_indent..line_indent {
3058 line.push(' ');
3059 }
3060 while let Some(ch) = self.current_char {
3061 if ch == '\n' || ch == '\r' {
3062 self.advance();
3063 break;
3064 }
3065 line.push(ch);
3066 self.advance();
3067 }
3068 content.push_str(&line);
3069 content.push('\n');
3070
3071 if self.current_char.is_none() {
3072 break;
3073 }
3074 }
3075
3076 Ok(apply_chomping(content, chomping))
3077 }
3078
3079 /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3080 fn is_doc_marker_here(&self) -> bool {
3081 let c0 = self.current_char;
3082 let c1 = self.peek_char(1);
3083 let c2 = self.peek_char(2);
3084 let c3 = self.peek_char(3);
3085 let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3086 (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3087 || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3088 }
3089
3090 /// Collect content for a folded block scalar.
3091 ///
3092 /// Folding rules (§8.1.3): a sequence of single blank lines between
3093 /// equally-indented non-empty content lines collapses into a single
3094 /// space; runs of blank lines emit `n-1` newlines; more-indented
3095 /// lines preserve their newline boundaries. After collection, apply
3096 /// chomping (§8.1.1.2).
3097 fn collect_folded_block_content(
3098 &mut self,
3099 content_indent: usize,
3100 chomping: ChompingMode,
3101 ) -> Result<String> {
3102 #[derive(Clone, Copy, PartialEq, Eq)]
3103 enum LineKind {
3104 Normal,
3105 MoreIndented,
3106 Empty,
3107 }
3108 struct Line {
3109 text: String,
3110 kind: LineKind,
3111 }
3112
3113 let mut lines: Vec<Line> = Vec::new();
3114
3115 loop {
3116 let mut line_indent = 0;
3117 let save_pos = self.position;
3118 let save_ch = self.current_char;
3119 let save_idx = self.current_char_index;
3120 while self.current_char == Some(' ') {
3121 line_indent += 1;
3122 self.advance();
3123 }
3124
3125 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3126
3127 if !line_is_blank && line_indent < content_indent {
3128 self.position = save_pos;
3129 self.current_char = save_ch;
3130 self.current_char_index = save_idx;
3131 break;
3132 }
3133
3134 if line_indent == 0 && self.is_doc_marker_here() {
3135 self.position = save_pos;
3136 self.current_char = save_ch;
3137 self.current_char_index = save_idx;
3138 break;
3139 }
3140
3141 if line_is_blank {
3142 if matches!(self.current_char, Some('\n' | '\r')) {
3143 lines.push(Line {
3144 text: String::new(),
3145 kind: LineKind::Empty,
3146 });
3147 self.advance();
3148 continue;
3149 }
3150 break;
3151 }
3152
3153 // Capture extra-indent leading spaces as part of content.
3154 let mut text = String::new();
3155 for _ in content_indent..line_indent {
3156 text.push(' ');
3157 }
3158 while let Some(ch) = self.current_char {
3159 if ch == '\n' || ch == '\r' {
3160 self.advance();
3161 break;
3162 }
3163 text.push(ch);
3164 self.advance();
3165 }
3166 // §8.1.3.2: "more indented" means the content (after the
3167 // common indent strip) begins with extra whitespace —
3168 // either spaces or tabs (yaml-test-suite MJS9).
3169 let kind = if text.starts_with(' ') || text.starts_with('\t') {
3170 LineKind::MoreIndented
3171 } else {
3172 LineKind::Normal
3173 };
3174 lines.push(Line { text, kind });
3175
3176 if self.current_char.is_none() {
3177 break;
3178 }
3179 }
3180
3181 // Build the folded output.
3182 let mut content = String::new();
3183 let mut idx = 0;
3184 while idx < lines.len() {
3185 let line = &lines[idx];
3186 match line.kind {
3187 LineKind::Normal | LineKind::MoreIndented => {
3188 content.push_str(&line.text);
3189 // Lookahead: count immediately-following empty lines.
3190 let mut j = idx + 1;
3191 let mut empties = 0;
3192 while j < lines.len() && lines[j].kind == LineKind::Empty {
3193 empties += 1;
3194 j += 1;
3195 }
3196 if j < lines.len() {
3197 // Spec §8.1.3.2: folding behaviour depends on
3198 // whether either surrounding content line is
3199 // "more indented" than the content indent.
3200 // - both Normal, 0 empties → fold to space.
3201 // - both Normal, k empties → k newlines (one
3202 // break folded out).
3203 // - any MoreIndented, 0 empties → 1 newline.
3204 // - any MoreIndented, k empties → k+1 newlines
3205 // (every break preserved).
3206 let mi_adjacent = line.kind == LineKind::MoreIndented
3207 || lines[j].kind == LineKind::MoreIndented;
3208 if empties == 0 {
3209 if mi_adjacent {
3210 content.push('\n');
3211 } else {
3212 content.push(' ');
3213 }
3214 } else {
3215 let breaks = if mi_adjacent { empties + 1 } else { empties };
3216 for _ in 0..breaks {
3217 content.push('\n');
3218 }
3219 }
3220 idx = j;
3221 } else {
3222 // End of stream after content (possibly trailing empties).
3223 // Always emit final `\n` for the last content line; extra
3224 // trailing empties contribute additional `\n`s, and chomping
3225 // will trim them later if needed.
3226 content.push('\n');
3227 for _ in 0..empties {
3228 content.push('\n');
3229 }
3230 break;
3231 }
3232 }
3233 LineKind::Empty => {
3234 // Leading empty lines (no preceding content): emit as `\n`s.
3235 content.push('\n');
3236 idx += 1;
3237 }
3238 }
3239 }
3240
3241 Ok(apply_chomping(content, chomping))
3242 }
3243
3244 /// Emit a `BlockMappingStart` token if the current position is the
3245 /// start of an implicit key and no mapping is yet active at this
3246 /// indent level. Shared by plain and quoted scalar dispatch.
3247 fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3248 let last_indent = *self.indent_stack.last().unwrap();
3249 let should_start_new_mapping = if self.current_indent > last_indent {
3250 true
3251 } else if self.current_indent == last_indent {
3252 !self.check_active_mapping_at_level(self.current_indent)
3253 } else {
3254 false
3255 };
3256 if should_start_new_mapping {
3257 // §6.1 + §8.22: opening a NEW block mapping at deeper
3258 // indent than the parent only makes sense if the parent
3259 // has a key WITHOUT a value (the new mapping IS that
3260 // value). If the parent's last content is a complete
3261 // (key, value) pair — i.e. the most recent meaningful
3262 // token is a value-position scalar/alias/close — then
3263 // there's no node to host the deeper mapping (yaml-test-
3264 // suite U44R: \`map:\\n key1: q\\n key2: bad\` — key2
3265 // is deeper than key1 but key1's value is already \`q\`).
3266 if self.current_indent > last_indent && last_indent > 0 {
3267 let mut depth = 0i32;
3268 let mut last_meaningful = None;
3269 for t in self.tokens.iter().rev() {
3270 match &t.token_type {
3271 TokenType::BlockEnd => depth += 1,
3272 TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3273 if depth == 0 {
3274 break;
3275 }
3276 depth -= 1;
3277 }
3278 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3279 other => {
3280 if depth == 0 {
3281 last_meaningful = Some(other.clone());
3282 break;
3283 }
3284 }
3285 }
3286 }
3287 if matches!(
3288 last_meaningful,
3289 Some(
3290 TokenType::Scalar(..)
3291 | TokenType::Alias(_)
3292 | TokenType::FlowSequenceEnd
3293 | TokenType::FlowMappingEnd
3294 | TokenType::BlockScalarLiteral(..)
3295 | TokenType::BlockScalarFolded(..)
3296 )
3297 ) {
3298 return Err(Error::scan(
3299 self.position,
3300 "Indentation increase has no parent in current mapping/sequence"
3301 .to_string(),
3302 ));
3303 }
3304 }
3305 self.indent_stack.push(self.current_indent);
3306 self.indent_is_sequence.push(false);
3307 self.resource_tracker
3308 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3309 self.tokens
3310 .push(Token::simple(TokenType::BlockMappingStart, self.position));
3311 }
3312 Ok(())
3313 }
3314
3315 /// Look ahead on the current line for a `:` that marks a mapping key.
3316 ///
3317 /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3318 /// followed by whitespace. Only `: ` terminates the scalar. If the
3319 /// line begins with `"` or `'`, the leading quoted scalar's contents
3320 /// are scanned past (including `''` and `\"` escapes) before looking
3321 /// for the `: ` that would make this scalar a key. This handles
3322 /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3323 /// For an alias/anchor at the current position, scan past
3324 /// the `&`/`*` and the name characters; if the FIRST char that
3325 /// would terminate the name is `:`, the colon is PART of the
3326 /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3327 /// that case so the caller can skip the implicit-key fast-path.
3328 fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3329 // Start after the `&` / `*` introducer.
3330 let mut i = self.current_char_index + 1;
3331 let n = self.char_cache.len();
3332 // Per scan_identifier rules: stop at whitespace or flow indicator.
3333 while i < n {
3334 let c = self.char_cache[i];
3335 if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3336 break;
3337 }
3338 i += 1;
3339 }
3340 // If the next char (or last consumed?) at termination is `:`,
3341 // then the name ended with `:`. Look at the LAST consumed
3342 // char. Actually our scan_identifier accepts `:` as part of
3343 // name — so the colon is already in the name. There's no
3344 // separate "value indicator" colon after.
3345 //
3346 // For the implicit-key fast path to be wrong, we need the
3347 // name to END with `:` (last char of name is `:`).
3348 if i > self.current_char_index + 1 {
3349 let last_name_char = self.char_cache[i - 1];
3350 if last_name_char == ':' {
3351 return true;
3352 }
3353 }
3354 false
3355 }
3356
3357 /// Scan ahead on the current line (the rest of the post-indent
3358 /// content) to determine whether it looks like an implicit
3359 /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3360 /// before any newline.
3361 fn line_after_indent_is_implicit_key(&self) -> bool {
3362 let mut i = self.current_char_index;
3363 let n = self.char_cache.len();
3364 while i < n {
3365 let ch = self.char_cache[i];
3366 if ch == '\n' || ch == '\r' {
3367 return false;
3368 }
3369 if ch == ':' {
3370 let next = self.char_cache.get(i + 1).copied();
3371 if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3372 return true;
3373 }
3374 }
3375 i += 1;
3376 }
3377 false
3378 }
3379
3380 /// Walk back through recent tokens; if the last non-property
3381 /// token was `Value` (`:`), the parser is in value-expectation
3382 /// mode (key not yet matched with a value).
3383 fn most_recent_token_is_value_separator(&self) -> bool {
3384 for t in self.tokens.iter().rev() {
3385 match t.token_type {
3386 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3387 TokenType::Value => return true,
3388 _ => return false,
3389 }
3390 }
3391 false
3392 }
3393
3394 fn check_for_mapping_ahead(&self) -> bool {
3395 let mut i = self.current_char_index;
3396 let n = self.char_cache.len();
3397 if i < n {
3398 let first = self.char_cache[i];
3399 if first == '\'' || first == '"' {
3400 let quote = first;
3401 i += 1;
3402 while i < n {
3403 let c = self.char_cache[i];
3404 if c == '\n' || c == '\r' {
3405 return false; // unterminated quote on line
3406 }
3407 if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3408 // `''` is the in-string single-quote escape.
3409 i += 2;
3410 continue;
3411 }
3412 if quote == '"' && c == '\\' {
3413 // Skip the escaped char.
3414 i += 2;
3415 continue;
3416 }
3417 if c == quote {
3418 i += 1;
3419 break;
3420 }
3421 i += 1;
3422 }
3423 }
3424 }
3425 // Skip balanced flow collections — a `:` *inside* `[...]` or
3426 // `{...}` does NOT make the line a block-mapping key (the flow
3427 // collection itself can BE the key, but its inner colons are
3428 // part of its own structure). yaml-test-suite: `{key: v}` is
3429 // a standalone flow mapping; `[a]: outer` is a block-map key.
3430 let mut flow_depth: i32 = 0;
3431 while i < n {
3432 let ch = self.char_cache[i];
3433 match ch {
3434 '\n' | '\r' => return false,
3435 '[' | '{' => flow_depth += 1,
3436 ']' | '}' => flow_depth -= 1,
3437 ':' if flow_depth <= 0 => {
3438 let next = self.char_cache.get(i + 1).copied();
3439 match next {
3440 None => return true,
3441 Some(c) if c.is_whitespace() => return true,
3442 _ => {}
3443 }
3444 }
3445 _ => {}
3446 }
3447 i += 1;
3448 }
3449 false
3450 }
3451
3452 /// Check if there's an active mapping at the specified indentation level
3453 /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3454 fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3455 let mut depth = 0;
3456
3457 // Walk backwards through tokens to find the innermost unmatched block start.
3458 // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3459 // decrement it (both open blocks that need a matching BlockEnd).
3460 // When depth == 0 we have found the block start that is still "open".
3461 for token in self.tokens.iter().rev() {
3462 match &token.token_type {
3463 TokenType::BlockMappingStart => {
3464 if depth == 0 {
3465 // The innermost open block is a mapping — active at this level.
3466 return true;
3467 }
3468 depth -= 1;
3469 }
3470 TokenType::BlockSequenceStart => {
3471 if depth == 0 {
3472 // The innermost open block is a sequence, not a mapping.
3473 return false;
3474 }
3475 depth -= 1;
3476 }
3477 TokenType::BlockEnd => {
3478 depth += 1;
3479 }
3480 TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3481 // Stop at document boundaries
3482 break;
3483 }
3484 _ => {}
3485 }
3486 }
3487
3488 false
3489 }
3490}
3491
3492impl Scanner for BasicScanner {
3493 fn check_token(&self) -> bool {
3494 // For lazy scanning: check if we have cached tokens or can generate more
3495 self.token_index < self.tokens.len() || !self.done
3496 }
3497
3498 fn peek_token(&self) -> Result<Option<&Token>> {
3499 // This is a bit tricky with lazy scanning since peek shouldn't mutate
3500 // For now, return cached token if available
3501 Ok(self.tokens.get(self.token_index))
3502 }
3503
3504 fn get_token(&mut self) -> Result<Option<Token>> {
3505 // If we need more tokens and haven't finished, scan next token
3506 if self.token_index >= self.tokens.len() && !self.done {
3507 self.scan_next_token()?;
3508 }
3509
3510 if self.token_index < self.tokens.len() {
3511 let token = self.tokens[self.token_index].clone();
3512 self.token_index += 1;
3513 Ok(Some(token))
3514 } else {
3515 Ok(None)
3516 }
3517 }
3518
3519 fn reset(&mut self) {
3520 self.token_index = 0;
3521 self.position = Position::start();
3522 self.tokens.clear();
3523 self.done = false;
3524 self.current_char = self.input.chars().next();
3525 self.indent_stack = vec![0];
3526 self.current_indent = 0;
3527 self.flow_level = 0;
3528 self.detected_indent_style = None;
3529 self.indent_samples.clear();
3530 self.previous_indent_level = 0;
3531 self.current_char_index = 0;
3532 self.current_char = self.char_cache.first().copied();
3533 }
3534
3535 fn position(&self) -> Position {
3536 self.position
3537 }
3538
3539 fn input(&self) -> &str {
3540 &self.input
3541 }
3542}
3543
3544#[cfg(test)]
3545mod tests {
3546 use super::*;
3547
3548 /// Drive the parser pipeline on `input` in a dedicated thread, returning
3549 /// `None` if it doesn't finish within `Duration::from_secs(2)`. Used by
3550 /// regression tests for parser hangs so a still-broken parser doesn't
3551 /// block the whole `cargo test` run.
3552 fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
3553 use crate::parser::{BasicParser, Parser as ParserTrait};
3554 use std::sync::mpsc;
3555 use std::thread;
3556 use std::time::Duration;
3557
3558 let owned = input.to_string();
3559 let (tx, rx) = mpsc::channel();
3560 thread::spawn(move || {
3561 let mut p = BasicParser::new_eager(owned);
3562 let _ = p.take_scanning_error();
3563 let mut events = Vec::new();
3564 loop {
3565 match p.get_event() {
3566 Ok(Some(ev)) => events.push(ev),
3567 Ok(None) => break,
3568 Err(_) => break,
3569 }
3570 }
3571 let _ = tx.send(events);
3572 });
3573 rx.recv_timeout(Duration::from_secs(2)).ok()
3574 }
3575
/// Regression test for an infinite loop on `---` glued to other text.
///
/// At line start the `-` arm tried `scan_document_start` (correctly
/// `None` here) and then `is_plain_scalar_start` (false for `-`), so no
/// character was consumed and the outer loop spun on the same input.
/// The fix falls through to `scan_plain_scalar` whenever the dashes are
/// not a document marker. See yaml-test-suite 82AN / EXG3.
#[test]
fn three_dashes_directly_followed_by_text_does_not_hang() {
    let events = parse_with_timeout("---word1\nword2\n")
        .expect("parser hung — `---word1` should not produce an infinite loop");

    // A scalar beginning with `---` proves the dashes were consumed as
    // plain-scalar text rather than as a separate document marker.
    let starts_with_dashes = events.iter().any(|event| match &event.event_type {
        crate::parser::EventType::Scalar { value, .. } => value.starts_with("---"),
        _ => false,
    });
    assert!(
        starts_with_dashes,
        "expected a plain scalar starting with `---`, got events: {events:?}"
    );
}
3602
/// YAML 1.2 §7.3.3 lets `?`, `:` and `-` begin a plain scalar as long as
/// the next character is not a space (nor, in flow context, a flow
/// indicator). `is_plain_scalar_start` used to reject all three
/// outright, so `?foo`, `:foo` and `-foo` were reported as
/// `Invalid character`. Tracked by yaml-test-suite 2EBW.
#[test]
fn question_mark_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("?foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Collect every scalar value until the stream ends or errors.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["?foo", "bar"]);
}
3622
/// A `:` immediately followed by non-space text starts a plain scalar,
/// so `:foo: bar` is the key `:foo` with the value `bar`.
#[test]
fn colon_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager(":foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Collect every scalar value until the stream ends or errors.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec![":foo", "bar"]);
}
3636
/// Every started document must be closed by a DocumentEnd event before
/// StreamEnd. The `TokenType::StreamEnd` handler used to emit `-DOC`
/// only in the `DocumentContent` / `BlockNode` states, so the
/// `DocumentStart` state (reached after `---` and a lone scalar such as
/// `"foo"`) lost its `-DOC`. Affected yaml-test-suite 27NA, 2G84/*,
/// 2LFX and several others.
#[test]
fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"foo\"\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Reduce the event stream to yaml-test-suite style short tags.
    let kinds: Vec<&str> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .map(|event| match event.event_type {
            EventType::StreamStart => "+STR",
            EventType::StreamEnd => "-STR",
            EventType::DocumentStart { .. } => "+DOC",
            EventType::DocumentEnd { .. } => "-DOC",
            EventType::Scalar { .. } => "=VAL",
            _ => "?",
        })
        .collect();

    // Critical: -DOC must be present and must precede -STR.
    let doc_end_idx = kinds.iter().position(|&kind| kind == "-DOC");
    let str_end_idx = kinds.iter().position(|&kind| kind == "-STR");
    assert!(
        doc_end_idx.is_some(),
        "missing -DOC in event stream: {kinds:?}"
    );
    assert!(
        doc_end_idx < str_end_idx,
        "expected -DOC before -STR, got {kinds:?}"
    );
}
3671
/// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings.
///
/// Covers all three numeric escape widths — `\xNN` (8-bit), `\uNNNN`
/// (16-bit) and `\UNNNNNNNN` (32-bit) — plus a raw non-ASCII character
/// that must pass through unchanged. Previously the 16-bit `\u` form
/// was not exercised at all.
#[test]
fn double_quoted_hex_escapes_decode_to_codepoint() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
    for (input, expected) in [
        (r#""\x41""#, "A"),
        // Raw (unescaped) non-ASCII content is kept verbatim.
        (r#""é""#, "é"),
        // 4-digit escape for the same code point, U+00E9.
        (r#""\u00e9""#, "\u{e9}"),
        (r#""\U0001F600""#, "\u{1f600}"),
    ] {
        let mut p = BasicParser::new_eager(input.to_string());
        assert!(
            p.take_scanning_error().is_none(),
            "no scan error for {input}"
        );
        // The first scalar event carries the decoded value.
        let mut found = None;
        while let Ok(Some(ev)) = p.get_event() {
            if let EventType::Scalar { value, .. } = ev.event_type {
                found = Some(value);
                break;
            }
        }
        assert_eq!(found.as_deref(), Some(expected), "input {input}");
    }
}
3696
/// A `\x` escape with only one hex digit is incomplete and must surface
/// as a scanning error.
#[test]
fn truncated_hex_escape_is_a_scan_error() {
    use crate::parser::BasicParser;

    let mut parser = BasicParser::new_eager(r#""\x4""#.to_string());
    let scan_error = parser.take_scanning_error();
    assert!(scan_error.is_some(), "truncated \\x escape must error");
}
3706
/// YAML 1.2 §5.7 gives double-quoted strings a closed list of escape
/// sequences; `\.` (like any unknown escape) must be rejected. The
/// error may surface either from the scanner or while pulling events.
/// Tracked by yaml-test-suite 55WF.
#[test]
fn invalid_double_quoted_escape_is_a_scan_error() {
    use crate::parser::{BasicParser, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"\\.\"\n".to_string());
    let scan_failed = parser.take_scanning_error().is_some();

    // Only drain events when the scanner itself did not complain.
    let parse_failed = !scan_failed
        && loop {
            match parser.get_event() {
                Ok(Some(_)) => continue,
                Ok(None) => break false,
                Err(_) => break true,
            }
        };

    assert!(
        scan_failed || parse_failed,
        "`\\.` is not a valid double-quoted escape and must error"
    );
}
3733
/// A complex-key marker (`?`) as the first content after an explicit
/// `---` must open an implicit block mapping. The parser used to handle
/// `?` only in the `ImplicitDocumentStart` / `DocumentContent` /
/// already-in-mapping states and errored in `DocumentStart`, breaking
/// inputs such as `--- !!set\n? Mark McGwire\n...`. Tracked by
/// yaml-test-suite 2XXW.
#[test]
fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser =
        BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    let mut saw_map_start = false;
    // Drain the stream; the loop's value says whether it ended in error.
    let saw_error = loop {
        match parser.get_event() {
            Ok(Some(event)) => {
                saw_map_start |= matches!(event.event_type, EventType::MappingStart { .. });
            }
            Ok(None) => break false,
            Err(_) => break true,
        }
    };

    assert!(!saw_error, "complex key after `--- !!set` must not error");
    assert!(saw_map_start, "expected a MappingStart event");
}
3764
/// YAML 1.2 §6.9.2: anchor and alias names may contain anything except
/// whitespace and the flow indicators `,[]{}`. `scan_identifier` was
/// once limited to ASCII alphanumerics, `_` and `-`, which rejected
/// valid names such as `&😁`. Tracked by yaml-test-suite 8XYN.
#[test]
fn anchor_name_may_contain_unicode_symbols() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
    assert!(
        parser.take_scanning_error().is_none(),
        "unicode anchor must not error"
    );

    // Gather the anchor of every anchored scalar in the stream.
    let anchors: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar {
                anchor: Some(name), ..
            } => Some(name),
            _ => None,
        })
        .collect();
    assert_eq!(anchors, vec!["😁"]);
}
3788
/// YAML 1.2 §5.6 / RFC 3986: a tag suffix may carry `%XX`
/// percent-escapes, which are URI-decoded on resolution. The scanner
/// used to reject `%` in a tag suffix as "Invalid character", so
/// `!e!tag%21 baz` failed before the resolver could decode it.
/// Tracked by yaml-test-suite 6CK3.
#[test]
fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let source = "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n".to_string();
    let mut parser = BasicParser::new_eager(source);
    assert!(
        parser.take_scanning_error().is_none(),
        "tag percent-escapes must not error"
    );

    // Gather the resolved tag of every tagged scalar in the stream.
    let tags: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { tag: Some(tag), .. } => Some(tag),
            _ => None,
        })
        .collect();
    assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
}
3812
/// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it
/// does not recognize." A reserved `%FOO` directive therefore must not
/// raise a scan error; its line is skipped and parsing carries on.
/// Tracked by yaml-test-suite 2LFX.
#[test]
fn reserved_directive_is_ignored_not_an_error() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let source =
        "%FOO bar baz # Should be ignored\n # with a warning.\n---\n\"foo\"\n".to_string();
    let mut parser = BasicParser::new_eager(source);
    assert!(
        parser.take_scanning_error().is_none(),
        "unknown directives must NOT produce a scan error"
    );

    // Only the document's scalar should come through.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["foo"]);
}
3836
/// The two physical lines of `---word1\nword2` must fold into the
/// single plain scalar `"---word1 word2"`. Tracked by yaml-test-suite
/// 82AN.
#[test]
fn three_dashes_followed_by_text_folds_continuation_line() {
    let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");

    let mut scalars: Vec<&str> = Vec::new();
    for event in &events {
        if let crate::parser::EventType::Scalar { value, .. } = &event.event_type {
            scalars.push(value.as_str());
        }
    }
    assert_eq!(scalars, vec!["---word1 word2"]);
}
3851
/// Regression test: a tab between the block-entry marker and a `-N`
/// value used to hang the scanner through the same `-` match arm.
/// See yaml-test-suite Y79Y/010.
#[test]
fn dash_tab_negative_number_does_not_hang() {
    let outcome = parse_with_timeout("-\t-1\n");
    let events = outcome.expect("parser hung — `-\\t-1` should not produce an infinite loop");
    assert!(!events.is_empty(), "expected event stream, got none");
}
3861
/// A bare `42` scans to exactly StreamStart, one scalar, StreamEnd.
#[test]
fn test_basic_tokenization() {
    let mut scanner = BasicScanner::new("42".to_string());

    assert!(scanner.check_token());

    let stream_start = scanner.get_token().unwrap().unwrap();
    assert!(matches!(stream_start.token_type, TokenType::StreamStart));

    // The number itself.
    let number = scanner.get_token().unwrap().unwrap();
    match number.token_type {
        TokenType::Scalar(value, _) => assert_eq!(value, "42"),
        _ => panic!("Expected scalar token"),
    }

    let stream_end = scanner.get_token().unwrap().unwrap();
    assert!(matches!(stream_end.token_type, TokenType::StreamEnd));
}
3884
/// The start of `[1, 2, 3]` scans to `[`, the scalar `1`, then a flow
/// entry (`,`).
#[test]
fn test_flow_sequence() {
    let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

    // StreamStart
    scanner.get_token().unwrap();

    // [
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

    // 1 — fail loudly if this is not a scalar at all, instead of
    // silently skipping the value check.
    let token = scanner.get_token().unwrap().unwrap();
    if let TokenType::Scalar(value, _) = token.token_type {
        assert_eq!(value, "1");
    } else {
        panic!("Expected scalar token");
    }

    // ,
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowEntry));
}
3906
/// A double-quoted string scans to one scalar holding the text between
/// the quotes.
#[test]
fn test_quoted_strings() {
    let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

    // Discard StreamStart.
    scanner.get_token().unwrap();

    let quoted = scanner.get_token().unwrap().unwrap();
    match quoted.token_type {
        TokenType::Scalar(value, _) => assert_eq!(value, "hello world"),
        _ => panic!("Expected scalar token"),
    }
}
3922
/// Full-line and end-of-line comments produce no tokens: only the four
/// key/value scalars come out, and no `Comment` token is present.
#[test]
fn test_comment_handling() {
    let input = r"
# Full line comment
key: value # End of line comment
# Another comment
data: test
";
    let mut scanner = BasicScanner::new(input.to_string());

    // Pull tokens until the stream ends or errors.
    let tokens: Vec<_> = std::iter::from_fn(|| scanner.get_token().ok().flatten()).collect();

    // Only YAML structure should remain — here, the four scalars.
    let mut scalar_values: Vec<String> = Vec::new();
    for token in &tokens {
        if let TokenType::Scalar(text, _) = &token.token_type {
            scalar_values.push(text.clone());
        }
    }
    assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

    // And no comment tokens anywhere.
    assert!(
        tokens
            .iter()
            .all(|t| !matches!(t.token_type, TokenType::Comment(_)))
    );
}
3956
/// A `#` inside a quoted scalar is literal text, while a `#` after a plain
/// scalar starts a comment that is dropped from the value.
#[test]
fn test_hash_in_strings() {
    let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
    let mut scanner = BasicScanner::new(input.to_string());

    // Pull tokens until the scanner yields `Ok(None)` or an error, keeping
    // only the scalar payloads.
    let scalar_values: Vec<String> = std::iter::from_fn(|| scanner.get_token().ok().flatten())
        .filter_map(|token| match token.token_type {
            TokenType::Scalar(value, _) => Some(value),
            _ => None,
        })
        .collect();

    // Each of these must have been scanned as a complete scalar.
    for expected in [
        "This has a # character",
        "Also has # character",
        "value",
    ] {
        assert!(scalar_values.contains(&expected.to_string()));
    }

    // The trailing comment text must not appear in any scalar.
    assert!(
        !scalar_values
            .iter()
            .any(|s| s.contains("This is a comment"))
    );
}
3982
/// Common double-quoted escape sequences decode to the characters they name.
#[test]
fn test_escape_sequences() {
    // Every case is double-quoted on purpose: YAML 1.2 §5.7 defines the
    // backslash escapes for double-quoted scalars only. Single-quoted
    // scalars have no backslash escapes (`''` is their sole escape).
    let cases = [
        (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
        (r#""Col1\tCol2""#, "Col1\tCol2"),
        (r#""First\rSecond""#, "First\rSecond"),
        (r#""Path\\to\\file""#, "Path\\to\\file"),
        (r#""He said \"Hello\"""#, "He said \"Hello\""),
    ];

    for (input, expected) in cases {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        // Anything other than a successfully scanned token is a failure.
        let token = match scanner.get_token() {
            Ok(Some(token)) => token,
            _ => panic!("Failed to get token for input: {}", input),
        };

        match token.token_type {
            TokenType::Scalar(value, _) => {
                assert_eq!(value, expected, "Failed for input: {}", input);
            }
            _ => panic!("Expected scalar token for input: {}", input),
        }
    }
}
4011
/// The less common single-character YAML escapes decode to the expected
/// control or literal characters.
#[test]
fn test_extended_yaml_escapes() {
    // (double-quoted source, decoded value)
    let cases = [
        (r#""\0""#, "\0"),   // null character
        (r#""\a""#, "\x07"), // bell
        (r#""\b""#, "\x08"), // backspace
        (r#""\f""#, "\x0C"), // form feed
        (r#""\v""#, "\x0B"), // vertical tab
        (r#""\e""#, "\x1B"), // escape
        (r#""\ ""#, " "),    // literal space
        (r#""\/""#, "/"),    // literal forward slash
    ];

    // Scan one input and return the payload of its first token after
    // StreamStart, panicking if that token is missing or not a scalar.
    let scan_scalar = |input: &str| {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => value,
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    };

    for (input, expected) in cases {
        let value = scan_scalar(input);
        assert_eq!(value, expected, "Failed for input: {}", input);
    }
}
4041
/// Unrecognised escapes inside double-quoted scalars must be reported as
/// scan errors.
#[test]
fn test_unknown_escape_sequences() {
    // Per YAML 1.2 §5.7 an unknown double-quoted escape is an error and is
    // not kept as literal text. (Earlier versions of this scanner preserved
    // the backslash + char verbatim — see commit history.)
    const INVALID_INPUTS: [&str; 3] = [r#""\z""#, r#""\q""#, r#""\8""#];

    for input in INVALID_INPUTS {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // StreamStart

        // The scalar token itself is where the bad escape must be rejected.
        let outcome = scanner.get_token();
        assert!(
            outcome.is_err(),
            "expected scan error for invalid escape in {input}"
        );
    }
}
4056}