rust_yaml/scanner/mod.rs
1//! YAML scanner for tokenization
2
3use crate::{Error, Limits, Position, ResourceTracker, Result, error::ErrorContext};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
15/// Trait for YAML scanners that convert character streams to tokens
16pub trait Scanner {
17 /// Check if there are more tokens available
18 fn check_token(&self) -> bool;
19
20 /// Peek at the next token without consuming it
21 fn peek_token(&self) -> Result<Option<&Token>>;
22
23 /// Get the next token, consuming it
24 fn get_token(&mut self) -> Result<Option<Token>>;
25
26 /// Reset the scanner state
27 fn reset(&mut self);
28
29 /// Get the current position in the input
30 fn position(&self) -> Position;
31
32 /// Get the input text for error reporting
33 fn input(&self) -> &str;
34}
35
/// How the tail of a block scalar is treated (YAML 1.2 §8.1.1.2).
///
/// The indicator character that selects each mode is noted on the variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChompingMode {
    /// `-`: the final line break and all trailing empty lines are dropped.
    Strip,
    /// No indicator (default): exactly one final line break is kept and
    /// trailing empty lines are dropped.
    Clip,
    /// `+`: the final line break and every trailing empty line are kept.
    Keep,
}
47
48/// Apply chomping mode to a block-scalar tail.
49///
50/// The collectors emit a `\n` for every line (content or blank). This helper
51/// trims that tail according to spec §8.1.1.2:
52///
53/// - **Strip:** remove every trailing `\n`.
54/// - **Clip:** keep exactly one trailing `\n` if content exists; drop the rest.
55/// Empty input stays empty.
56/// - **Keep:** preserve everything.
57fn apply_chomping(mut s: String, mode: ChompingMode) -> String {
58 match mode {
59 ChompingMode::Keep => s,
60 ChompingMode::Strip => {
61 while s.ends_with('\n') {
62 s.pop();
63 }
64 s
65 }
66 ChompingMode::Clip => {
67 // Strip trailing newlines. If anything remains, restore one.
68 // §8.1.1.2: clip keeps the final line break only when the
69 // scalar has actual content (yaml-test-suite K858: an empty
70 // clip scalar `>` is `""`, not `"\n"`).
71 while s.ends_with('\n') {
72 s.pop();
73 }
74 if !s.is_empty() {
75 s.push('\n');
76 }
77 s
78 }
79 }
80}
81
82/// A basic scanner implementation for YAML tokenization
83#[derive(Debug)]
84#[allow(dead_code)]
85pub struct BasicScanner {
86 input: String,
87 position: Position,
88 current_char: Option<char>,
89 tokens: Vec<Token>,
90 token_index: usize,
91 done: bool,
92 indent_stack: Vec<usize>,
93 current_indent: usize,
94 allow_simple_key: bool,
95 simple_key_allowed: bool,
96 flow_level: usize,
97 preserve_comments: bool,
98 // Indentation style detection
99 detected_indent_style: Option<crate::value::IndentStyle>,
100 indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
101 previous_indent_level: usize, // Track the previous indentation for style detection
102 // Performance optimizations
103 buffer: String, // Reusable string buffer for token values
104 char_cache: Vec<char>, // Cached characters for faster access
105 current_char_index: usize, // Current index in char_cache
106 profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
107 // Error tracking
108 scanning_error: Option<Error>, // Store scanning errors for later retrieval
109 // Resource tracking
110 limits: Limits,
111 resource_tracker: ResourceTracker,
112 // Track inline nested sequences that need closing
113 inline_sequence_depth: usize,
114 // Track compact-notation sequences (where `-` is at the same indent as
115 // the parent mapping keys). These are NOT on indent_stack, so we need
116 // separate tracking to know when to emit BlockEnd for them.
117 compact_sequence_indents: Vec<usize>,
118 // Parallel to indent_stack: true when the entry was pushed by a block
119 // sequence, false when by a mapping. Lets us distinguish "continuing a
120 // regular sequence" from "starting a compact sequence at same indent".
121 indent_is_sequence: Vec<bool>,
122}
123
124impl BasicScanner {
125 /// Create a new scanner from input string
126 pub fn new(input: String) -> Self {
127 Self::with_limits(input, Limits::default())
128 }
129
130 /// Create a new scanner with custom resource limits
131 pub fn with_limits(input: String, limits: Limits) -> Self {
132 let char_cache: Vec<char> = input.chars().collect();
133 let current_char = char_cache.first().copied();
134
135 // Track document size for resource limits
136 let mut resource_tracker = ResourceTracker::new();
137 if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
138 // If the input is too large, create scanner with error state
139 return Self {
140 current_char: None,
141 input,
142 position: Position::start(),
143 tokens: Vec::new(),
144 token_index: 0,
145 done: true,
146 indent_stack: vec![0],
147 current_indent: 0,
148 allow_simple_key: false,
149 simple_key_allowed: false,
150 flow_level: 0,
151 preserve_comments: false,
152 detected_indent_style: None,
153 indent_samples: Vec::new(),
154 previous_indent_level: 0,
155 buffer: String::new(),
156 char_cache: Vec::new(),
157 current_char_index: 0,
158 profiler: None,
159 scanning_error: Some(e),
160 limits,
161 resource_tracker,
162 inline_sequence_depth: 0,
163 compact_sequence_indents: Vec::new(),
164 indent_is_sequence: vec![false],
165 };
166 }
167
168 Self {
169 current_char,
170 input,
171 position: Position::start(),
172 tokens: Vec::new(),
173 token_index: 0,
174 done: false,
175 indent_stack: vec![0], // Always start with base indentation
176 current_indent: 0,
177 allow_simple_key: true,
178 simple_key_allowed: true,
179 flow_level: 0,
180 preserve_comments: false,
181 detected_indent_style: None,
182 indent_samples: Vec::new(),
183 previous_indent_level: 0,
184 buffer: String::with_capacity(64), // Pre-allocate buffer
185 char_cache,
186 current_char_index: 0,
187 profiler: std::env::var("RUST_YAML_PROFILE")
188 .ok()
189 .map(|_| crate::profiling::YamlProfiler::new()),
190 scanning_error: None,
191 limits,
192 resource_tracker,
193 inline_sequence_depth: 0,
194 compact_sequence_indents: Vec::new(),
195 indent_is_sequence: vec![false],
196 }
197 }
198
199 /// Create a new scanner with eager token scanning (for compatibility)
200 pub fn new_eager(input: String) -> Self {
201 Self::new_eager_with_limits(input, Limits::default())
202 }
203
204 /// Create a new scanner with eager token scanning and custom limits
205 pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
206 let mut scanner = Self::with_limits(input, limits);
207 // Store any scanning errors for later retrieval
208 if let Err(error) = scanner.scan_all_tokens() {
209 scanner.scanning_error = Some(error);
210 }
211 scanner
212 }
213
214 /// Create a new scanner with comment preservation enabled
215 pub fn new_with_comments(input: String) -> Self {
216 let mut scanner = Self::new(input);
217 scanner.preserve_comments = true;
218 scanner
219 }
220
221 /// Create a new scanner with comments and custom limits
222 pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
223 let mut scanner = Self::with_limits(input, limits);
224 scanner.preserve_comments = true;
225 scanner
226 }
227
228 /// Create a new scanner with eager scanning and comment preservation
229 pub fn new_eager_with_comments(input: String) -> Self {
230 let mut scanner = Self::new_with_comments(input);
231 // Mirror `new_eager_with_limits`: record scanning errors instead
232 // of discarding them (#19). Previously this used
233 // `unwrap_or(())`, silently truncating the token stream and
234 // returning a scanner whose `has_scanning_error()` reported
235 // false — silent data loss for comment-preserving callers.
236 if let Err(error) = scanner.scan_all_tokens() {
237 scanner.scanning_error = Some(error);
238 }
239 scanner
240 }
241
242 /// Get the detected indentation style from the document
243 pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
244 self.detected_indent_style.as_ref()
245 }
246
247 /// Check if there was a scanning error
248 pub const fn has_scanning_error(&self) -> bool {
249 self.scanning_error.is_some()
250 }
251
252 /// Get the scanning error if any
253 #[allow(clippy::missing_const_for_fn)]
254 pub fn take_scanning_error(&mut self) -> Option<Error> {
255 self.scanning_error.take()
256 }
257
258 /// Advance to the next character
259 fn advance(&mut self) -> Option<char> {
260 if let Some(ch) = self.current_char {
261 self.position = self.position.advance(ch);
262 self.current_char_index += 1;
263
264 if self.current_char_index < self.char_cache.len() {
265 self.current_char = Some(self.char_cache[self.current_char_index]);
266 } else {
267 self.current_char = None;
268 }
269 }
270
271 self.current_char
272 }
273
274 /// Skip whitespace characters (excluding newlines)
275 fn skip_whitespace(&mut self) {
276 while let Some(ch) = self.current_char {
277 if ch == ' ' || ch == '\t' {
278 self.advance();
279 } else {
280 break;
281 }
282 }
283 }
284
285 /// Handle indentation and produce block tokens if necessary
286 fn handle_indentation(&mut self) -> Result<()> {
287 // In flow context: if there is a non-trivial enclosing block
288 // (indent_stack has more than the implicit root level), each
289 // continuation line that has content must be indented MORE than
290 // that enclosing block's indent. \`flow: [a,\\nb,c]\` with \`b\`
291 // at col 1 violates this rule because the block mapping enclosing
292 // \`flow:\` sits at indent 0 (yaml-test-suite 9C9N).
293 //
294 // Top-level flow (no enclosing block; indent_stack is just \[0\])
295 // is exempt — `[a,\\nb]` is fine there because the flow content
296 // isn't nested inside any block (yaml-test-suite 4ZYM).
297 if self.flow_level > 0 {
298 if self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty() {
299 let mut probe = 0usize;
300 let mut i = self.current_char_index;
301 while i < self.char_cache.len() {
302 match self.char_cache[i] {
303 ' ' => {
304 probe += 1;
305 i += 1;
306 }
307 '\t' => i += 1,
308 _ => break,
309 }
310 }
311 let has_content = self
312 .char_cache
313 .get(i)
314 .map_or(false, |c| !matches!(c, '\n' | '\r'));
315 // A line that begins with the matching flow closer
316 // (\`]\` / \`}\`) is allowed at the parent indent — it
317 // closes the flow collection, not adds content
318 // (yaml-test-suite NKF9 trailing-line \`}\` at col 1).
319 let is_closer = matches!(self.char_cache.get(i).copied(), Some(']' | '}'));
320 if has_content && !is_closer {
321 let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
322 if probe <= parent_indent {
323 return Err(Error::scan(
324 self.position,
325 "Flow content line is not indented enough".to_string(),
326 ));
327 }
328 }
329 }
330 return Ok(());
331 }
332
333 let line_start_pos = self.position;
334 let mut indent = 0;
335 let mut has_tabs = false;
336 let mut has_spaces = false;
337 let _indent_start_pos = self.position;
338
339 // Count indentation and detect style
340 while let Some(ch) = self.current_char {
341 if ch == ' ' {
342 indent += 1;
343 has_spaces = true;
344 self.advance();
345 } else if ch == '\t' {
346 indent += 8; // Tab counts as 8 spaces for indentation calculation
347 has_tabs = true;
348 self.advance();
349 } else {
350 break;
351 }
352 }
353
354 // Analyze indentation pattern for style detection
355 // Only analyze if there's actual content after the indentation (not just whitespace)
356 if indent > 0
357 && self.current_char.is_some()
358 && !matches!(self.current_char, Some('\n' | '\r'))
359 {
360 self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
361 }
362
363 // YAML 1.2 §6.1 does NOT require all indents to be multiples
364 // of a single "indent width". Siblings must share a column;
365 // children must indent further; but any positive amount works
366 // (e.g. `key:\n child:\n grandchild:` with widths 2, 1
367 // is legal). The earlier strict-multiple-of-N check rejected
368 // valid spec fixtures like 6HB6, 8G76, A2M4, P94K, Q9WF,
369 // UGM3. We rely on the indent_stack-driven open/close logic
370 // (and the per-block "more than parent" rule enforced
371 // elsewhere) to catch genuine mis-indentation.
372
373 // Update previous indentation level for future comparisons
374 if indent > 0 {
375 self.previous_indent_level = indent;
376 }
377
378 // Update current indentation level
379 self.current_indent = indent;
380
381 // Close compact-notation sequences whose scope ends at this line.
382 // A compact sequence (where `-` shares the indent of the parent
383 // mapping keys) ends when the next content line at that indent is
384 // NOT a block entry (`- `). We must emit the sequence's BlockEnd
385 // BEFORE popping the indent_stack so that the nesting order is
386 // correct (sequence closes before its parent mapping).
387 let has_content =
388 self.current_char.is_some() && !matches!(self.current_char, Some('\n' | '\r' | '#'));
389 if has_content {
390 let is_block_entry = self.current_char == Some('-')
391 && self.peek_char(1).map_or(true, |c| c.is_whitespace());
392 while let Some(&seq_indent) = self.compact_sequence_indents.last() {
393 if indent < seq_indent || (indent == seq_indent && !is_block_entry) {
394 self.compact_sequence_indents.pop();
395 self.tokens
396 .push(Token::simple(TokenType::BlockEnd, line_start_pos));
397 } else {
398 break;
399 }
400 }
401 }
402
403 // Check if we need to emit block end tokens for decreased indentation
404 let pre_pop_top = self.indent_stack.last().copied().unwrap_or(0);
405 while let Some(&last_indent) = self.indent_stack.last() {
406 if indent < last_indent && last_indent > 0 {
407 self.indent_stack.pop();
408 self.indent_is_sequence.pop();
409 self.tokens
410 .push(Token::simple(TokenType::BlockEnd, line_start_pos));
411 } else {
412 break;
413 }
414 }
415
416 // §6.1: after a dedent, the new line's indent must match some
417 // existing container level — keys/items at a sibling level
418 // must share a column. Landing at a column that is between
419 // two stack levels (e.g. parent at 0, just-closed at 3, new
420 // line at 1) is invalid because no open mapping/sequence sits
421 // at indent 1 (yaml-test-suite DMG6, N4JP).
422 //
423 // The check applies only when:
424 // * we actually dedented (pre-pop top was deeper than now),
425 // * the new line has content (the next char is not blank /
426 // newline / EOF / comment),
427 // * indent doesn't match the new top.
428 if pre_pop_top > 0
429 && pre_pop_top > self.indent_stack.last().copied().unwrap_or(0)
430 && self
431 .current_char
432 .map_or(false, |c| !matches!(c, '\n' | '\r' | '#'))
433 && indent != self.indent_stack.last().copied().unwrap_or(0)
434 {
435 // Allow if indent is a valid deeper level — e.g.
436 // sibling at depth then deeper child — but for the
437 // dedent path indent must equal a known stack level.
438 return Err(Error::scan(
439 self.position,
440 format!(
441 "Indentation {indent} doesn't match any open container (expected {} or deeper)",
442 self.indent_stack.last().copied().unwrap_or(0)
443 ),
444 ));
445 }
446
447 Ok(())
448 }
449
450 /// Analyze indentation pattern to detect the document's indentation style
451 fn analyze_indentation_pattern(
452 &mut self,
453 current_indent: usize,
454 has_tabs: bool,
455 has_spaces: bool,
456 ) -> Result<()> {
457 // Prevent mixed indentation (tabs + spaces on same line).
458 // Carve-out: a tab AFTER one or more spaces and BEFORE
459 // value-position content (not a key) is content-area
460 // whitespace, not indentation. \`foo:\\n \\tbar\` — the 1
461 // space is indent, the tab is a separator before \`bar\`
462 // which is the value of \`foo:\` (yaml-test-suite DK95/00).
463 if has_tabs && has_spaces {
464 // Peek ahead: if the content after the tab+spaces area
465 // contains a key marker (`: ` or `:`+EOL), treat as
466 // indentation (invalid). Otherwise it's a value line.
467 let looks_like_key = self.line_after_indent_is_implicit_key();
468 if looks_like_key {
469 let context =
470 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
471 .with_suggestion(
472 "Use either tabs OR spaces for indentation, not both".to_string(),
473 );
474 return Err(Error::invalid_character_with_context(
475 self.position,
476 '\t',
477 "mixed indentation",
478 context,
479 ));
480 }
481 }
482 // §6.1: indentation must be space characters only. Pure-tab
483 // indentation (\`\\tkey: value\`) is invalid (yaml-test-suite
484 // 4EJS). Two carve-outs:
485 // * The mixed case is caught by the earlier branch.
486 // * Tabs before a flow-collection opener (\`\\t[\`, \`\\t{\`)
487 // at the root are not "block indentation" — there's no
488 // enclosing block — and yaml-test-suite 6CA3 / Q5MG accept
489 // them.
490 if has_tabs && !has_spaces && !matches!(self.current_char, Some('[' | '{')) {
491 let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
492 .with_suggestion("Use space characters for indentation".to_string());
493 return Err(Error::invalid_character_with_context(
494 self.position,
495 '\t',
496 "indentation",
497 context,
498 ));
499 }
500
501 // If we detected tabs, check for mixed indentation across lines
502 if has_tabs {
503 match self.detected_indent_style {
504 None => {
505 // First time detecting indentation style - set to tabs
506 self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
507 }
508 Some(crate::value::IndentStyle::Spaces(_)) => {
509 // Previously detected spaces, now seeing tabs - mixed indentation error
510 let context =
511 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
512 .with_suggestion(
513 "Use consistent indentation style throughout the document"
514 .to_string(),
515 );
516 return Err(Error::invalid_character_with_context(
517 self.position,
518 '\t',
519 "mixed indentation",
520 context,
521 ));
522 }
523 Some(crate::value::IndentStyle::Tabs) => {
524 // Already using tabs - this is consistent
525 }
526 }
527 return Ok(());
528 }
529
530 // For spaces, check for mixed indentation across lines first
531 if has_spaces {
532 // Check if we previously detected tabs
533 if matches!(
534 self.detected_indent_style,
535 Some(crate::value::IndentStyle::Tabs)
536 ) {
537 let context =
538 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
539 .with_suggestion(
540 "Use consistent indentation style throughout the document".to_string(),
541 );
542 return Err(Error::invalid_character_with_context(
543 self.position,
544 ' ',
545 "mixed indentation",
546 context,
547 ));
548 }
549
550 // Calculate the indentation level difference
551 if current_indent > self.previous_indent_level {
552 let indent_diff = current_indent - self.previous_indent_level;
553
554 // Store this sample for analysis (but only meaningful differences)
555 if indent_diff > 0 && indent_diff <= 8 {
556 // Reasonable indentation range
557 self.indent_samples.push((indent_diff, false));
558
559 // Try to determine consistent indentation width
560 if self.detected_indent_style.is_none() {
561 self.detect_space_indentation_width();
562 }
563 }
564 }
565
566 // YAML 1.2 §6.1 does NOT require all indents to be multiples
567 // of a single "indent width". Sibling lines must share a
568 // column and children must indent deeper than parents, but
569 // any positive amount works. The "multiple of N" check
570 // rejected valid spec fixtures (6HB6, M5C3, P94K, Q9WF,
571 // RZP5, UGM3, XW4D, A2M4); we rely on the indent_stack
572 // open/close logic for genuine mis-indentation. The detected
573 // style is still recorded for later style-preservation use
574 // (e.g. emitter), it just no longer drives validation.
575 // self.validate_indentation_consistency(current_indent)?;
576 }
577
578 Ok(())
579 }
580
581 /// Detect the consistent space indentation width from samples
582 fn detect_space_indentation_width(&mut self) {
583 if self.indent_samples.is_empty() {
584 return; // Need at least 1 sample
585 }
586
587 // Find the most common indentation width
588 let mut width_counts = std::collections::HashMap::new();
589
590 for &(width, is_tabs) in &self.indent_samples {
591 if !is_tabs && width > 0 {
592 *width_counts.entry(width).or_insert(0) += 1;
593 }
594 }
595
596 // Find the most frequent width - be more aggressive and detect early
597 if let Some((&most_common_width, &_count)) =
598 width_counts.iter().max_by_key(|&(_, count)| count)
599 {
600 // Set on first consistent sample to enable stricter validation
601 self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
602 }
603 }
604
605 /// Check if the given indentation level is valid based on current context
606 #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
607 fn is_valid_indentation_level(&self, indent: usize) -> bool {
608 // For now, allow any indentation that could represent valid nesting
609 // In the future, this could be made more strict by checking against
610 // the current indent_stack to ensure proper nesting
611 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
612 // Must be a multiple of the detected width
613 indent % width == 0
614 } else {
615 // If no style detected yet, allow any indentation
616 true
617 }
618 }
619
620 /// Validate that current indentation is consistent with detected style
621 fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
622 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
623 // Check if current indentation is a multiple of the detected width
624 if current_indent > 0 && current_indent % width != 0 {
625 let lower_level = (current_indent / width) * width;
626 let higher_level = lower_level + width;
627 let suggestion = format!(
628 "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
629 width, lower_level, higher_level, current_indent
630 );
631 let context =
632 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
633 .with_suggestion(suggestion);
634 return Err(Error::indentation_with_context(
635 self.position,
636 (current_indent / width) * width, // expected (nearest valid level)
637 current_indent, // found
638 context,
639 ));
640 }
641 }
642 Ok(())
643 }
644
645 /// Check if current position starts a plain scalar
646 fn is_plain_scalar_start(&self) -> bool {
647 self.current_char.map_or(false, |ch| match ch {
648 // Pure indicators — never start a plain scalar.
649 ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>' | '\'' | '"' | '%'
650 | '@' | '`' => false,
651 // YAML 1.2 §7.3.3: `?`, `:`, `-` may start a plain scalar when
652 // the next character is non-whitespace (and, in flow context,
653 // not a flow indicator). Otherwise they act as indicators
654 // (complex-key marker / value separator / block-entry marker).
655 '?' | ':' | '-' => match self.peek_char(1) {
656 None => false,
657 Some(c) if c.is_whitespace() => false,
658 Some(c) if self.flow_level > 0 && ",[]{}".contains(c) => false,
659 Some(_) => true,
660 },
661 _ => !ch.is_whitespace(),
662 })
663 }
664
/// Returns whether `value` is one of the recognised boolean spellings.
///
/// Each of `true`/`false`, `yes`/`no` and `on`/`off` is accepted in
/// lower case, Title case and UPPER case only; other mixed-case forms
/// are not.
fn is_yaml_bool(value: &str) -> bool {
    const BOOL_SPELLINGS: [&str; 18] = [
        "true", "false", "True", "False", "TRUE", "FALSE",
        "yes", "no", "Yes", "No", "YES", "NO",
        "on", "off", "On", "Off", "ON", "OFF",
    ];
    BOOL_SPELLINGS.contains(&value)
}
689
/// Returns whether `value` is a recognised null spelling: the empty
/// string, `~`, or `null` in lower, Title or UPPER case.
fn is_yaml_null(value: &str) -> bool {
    value.is_empty() || ["null", "Null", "NULL", "~"].contains(&value)
}
694
/// Pass a plain scalar's text through unchanged.
///
/// The scanner deliberately keeps the original spelling of plain
/// scalars. Type resolution, including the version-aware bool/null
/// mapping, is left to the composer (see
/// `crate::resolver::resolve_plain_scalar`). Keeping the text intact
/// lets the composer apply the YAML 1.1 vs 1.2 distinction and lets
/// round-trip emitters reproduce the source spelling.
fn normalize_scalar(value: String) -> String {
    value
}
706
707 /// Scan a number token
708 fn scan_number(&mut self) -> Result<Token> {
709 let start_pos = self.position;
710 let mut value = String::new();
711
712 // Handle negative numbers
713 if self.current_char == Some('-') {
714 value.push('-');
715 self.advance();
716 }
717
718 // Scan digits
719 while let Some(ch) = self.current_char {
720 if ch.is_ascii_digit() {
721 value.push(ch);
722 self.advance();
723 } else if ch == '.' {
724 value.push(ch);
725 self.advance();
726 // Scan fractional part
727 while let Some(ch) = self.current_char {
728 if ch.is_ascii_digit() {
729 value.push(ch);
730 self.advance();
731 } else {
732 break;
733 }
734 }
735 break;
736 } else {
737 break;
738 }
739 }
740
741 Ok(Token::new(
742 TokenType::Scalar(value, tokens::QuoteStyle::Plain),
743 start_pos,
744 self.position,
745 ))
746 }
747
748 /// Scan a plain scalar (unquoted string)
749 fn scan_plain_scalar(&mut self) -> Result<Token> {
750 let start_pos = self.position;
751 let start_col = start_pos.column;
752 let mut value = String::new();
753 let mut multi_line = false;
754
755 loop {
756 // Scan content on the current line until we hit a stop condition.
757 while let Some(ch) = self.current_char {
758 if self.flow_level == 0 {
759 match ch {
760 '\n' | '\r' => break,
761 ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
762 '#' if value.is_empty()
763 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
764 {
765 break;
766 }
767 _ => {}
768 }
769 } else {
770 match ch {
771 // Same line-break handling as block context: stop
772 // collecting raw content at `\n`/`\r`, then let the
773 // outer fold logic decide whether the next line
774 // continues this scalar (yaml-test-suite 8KB6,
775 // 8UDB, 9BXH).
776 '\n' | '\r' => break,
777 ',' | '[' | ']' | '{' | '}' => break,
778 // In flow context, `:` is a key-value separator
779 // when followed by whitespace OR any flow indicator
780 // (`,`, `[`, `]`, `{`, `}`). Tracked by yaml-test-
781 // suite FRK4 (`{ ? foo :, ... }`).
782 ':' if self
783 .peek_char(1)
784 .map_or(true, |c| c.is_whitespace() || ",[]{}".contains(c)) =>
785 {
786 break;
787 }
788 '#' if value.is_empty()
789 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
790 {
791 break;
792 }
793 _ => {}
794 }
795 }
796 value.push(ch);
797 self.advance();
798 }
799
800 // If we didn't stop at a newline, this scalar is complete.
801 if !matches!(self.current_char, Some('\n' | '\r')) {
802 break;
803 }
804
805 // Per §6.5 line folding, trailing whitespace on the line is
806 // dropped (it gets replaced by the fold separator that the
807 // next continuation block emits).
808 while matches!(value.chars().last(), Some(' ' | '\t')) {
809 value.pop();
810 }
811
812 // YAML 1.2 §6.5 / §7.3.3: try to fold continuation lines into
813 // the same plain scalar. A continuation line must be:
814 // * indented strictly more than the scalar's start column,
815 // * not a document marker (`---` / `...`),
816 // * not a comment-only line,
817 // * not empty-with-EOF.
818 // Save state for backtracking if continuation isn't allowed.
819 let saved_position = self.position;
820 let saved_index = self.current_char_index;
821 let saved_char = self.current_char;
822
823 // Count physical newlines we skip; whitespace within the lines
824 // is also consumed.
825 let mut newlines = 0usize;
826 loop {
827 match self.current_char {
828 Some('\n') => {
829 newlines += 1;
830 self.advance();
831 }
832 Some('\r') => {
833 self.advance();
834 }
835 Some(' ' | '\t') => {
836 self.advance();
837 }
838 _ => break,
839 }
840 }
841
842 let next_col = self.position.column;
843 let next_ch = self.current_char;
844 let is_doc_marker = matches!(next_ch, Some('-' | '.'))
845 && self.peek_char(1) == next_ch
846 && self.peek_char(2) == next_ch
847 && self.peek_char(3).map_or(true, |c| c.is_whitespace());
848
849 // Continuation column rule:
850 // * Flow context: no column rule, only flow indicators
851 // terminate (8KB6, 8UDB, 9BXH).
852 // * Block context: must be strictly deeper than the parent
853 // block's key column. The parent indent is the max of
854 // `indent_stack.last()` (block mapping/sequence indent)
855 // and `compact_sequence_indents.last()` — the latter
856 // tracks sequences opened compactly (e.g. `? - x` where
857 // the dash didn't push to indent_stack). Without the
858 // compact-stack check, `? - Detroit Tigers\n - Chicago`
859 // would fold both lines into one scalar (yaml-test-
860 // suite M5DY).
861 // Fall back to `next_col >= start_col` for top-level
862 // scalars where there's no enclosing block.
863 let column_ok = if self.flow_level > 0 {
864 true
865 } else {
866 let block_indent = self.indent_stack.last().copied().unwrap_or(0);
867 let compact_indent = self.compact_sequence_indents.last().copied().unwrap_or(0);
868 let parent_indent = block_indent.max(compact_indent);
869 next_col >= parent_indent + 2 || next_col >= start_col
870 };
871 let can_continue = next_ch.is_some()
872 && !matches!(next_ch, Some('\n' | '\r' | '#'))
873 && column_ok
874 && !is_doc_marker
875 && !(self.flow_level > 0 && matches!(next_ch, Some(',' | ']' | '}')));
876
877 if !can_continue {
878 self.position = saved_position;
879 self.current_char_index = saved_index;
880 self.current_char = saved_char;
881 break;
882 }
883
884 // Append fold separator: single newline → space; N>1 newlines
885 // collapse to N-1 retained newlines (YAML §6.5 line folding).
886 if newlines <= 1 {
887 value.push(' ');
888 } else {
889 for _ in 0..(newlines - 1) {
890 value.push('\n');
891 }
892 }
893 multi_line = true;
894 }
895
896 // YAML 1.2 §8.1.3: implicit keys must be on a single line. If the
897 // plain scalar folded across line breaks AND the next non-
898 // whitespace char is `:` (key-value separator), it's about to be
899 // used as an implicit key — reject (yaml-test-suite G7JE).
900 if multi_line && self.flow_level == 0 {
901 let mut off = 0isize;
902 while matches!(self.peek_char(off), Some(' ' | '\t')) {
903 off += 1;
904 }
905 if self.peek_char(off) == Some(':') {
906 return Err(Error::scan(
907 self.position,
908 "Multi-line plain scalar may not be used as an implicit key".to_string(),
909 ));
910 }
911 }
912
913 self.resource_tracker
914 .check_string_length(&self.limits, value.len())?;
915
916 let value = value.trim_end().to_string();
917 let normalized_value = Self::normalize_scalar(value);
918
919 Ok(Token::new(
920 TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
921 start_pos,
922 self.position,
923 ))
924 }
925
926 /// Scan a quoted string
927 fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
928 let start_pos = self.position;
929 let mut value = String::new();
930
931 // Determine quote style based on quote character
932 let quote_style = match quote_char {
933 '\'' => tokens::QuoteStyle::Single,
934 '"' => tokens::QuoteStyle::Double,
935 _ => tokens::QuoteStyle::Plain,
936 };
937
938 self.advance(); // Skip opening quote
939 let mut closed = false;
940 let mut multi_line = false;
941 // High-water mark of bytes contributed by escape sequences. The
942 // trailing-whitespace strip at fold time must not pop past it,
943 // because an escape-produced \t / space is literal content
944 // (yaml-test-suite DE56/00, DE56/01).
945 let mut escape_end: usize = 0;
946
947 while let Some(ch) = self.current_char {
948 if ch == quote_char {
949 // YAML 1.2 §7.3.2 (Single-Quoted): `''` is the only escape,
950 // collapsing to a single `'`. Detect that here BEFORE
951 // treating the quote as the closing delimiter.
952 if quote_char == '\'' && self.peek_char(1) == Some('\'') {
953 value.push('\'');
954 self.advance();
955 self.advance();
956 continue;
957 }
958 self.advance(); // Skip closing quote
959 closed = true;
960 break;
961 } else if ch == '\\' && quote_char == '"' {
962 self.advance();
963 if let Some(escaped) = self.current_char {
964 match escaped {
965 // YAML 1.2 §5.7 double-quoted escape allowlist.
966 'n' => value.push('\n'),
967 't' => value.push('\t'),
968 'r' => value.push('\r'),
969 '\\' => value.push('\\'),
970 '"' => value.push('"'),
971 '0' => value.push('\0'),
972 'a' => value.push('\x07'),
973 'b' => value.push('\x08'),
974 'f' => value.push('\x0C'),
975 'v' => value.push('\x0B'),
976 'e' => value.push('\x1B'),
977 ' ' => value.push(' '),
978 '/' => value.push('/'),
979 'N' => value.push('\u{0085}'),
980 '_' => value.push('\u{00A0}'),
981 'L' => value.push('\u{2028}'),
982 'P' => value.push('\u{2029}'),
983 '\n' => {
984 // Escaped line break (§7.3.2): the newline is
985 // dropped AND leading whitespace on the next
986 // line is excluded from the content.
987 self.advance();
988 while matches!(self.current_char, Some(' ' | '\t')) {
989 self.advance();
990 }
991 continue;
992 }
993 '\t' => value.push('\t'), // literal tab after `\` → tab (yaml-test-suite 3RLN/DE56)
994 // Hex / Unicode escapes per YAML 1.2 §5.7:
995 // \xNN — 2 hex digits, codepoint ≤ 0xFF
996 // \uNNNN — 4 hex digits, codepoint ≤ 0xFFFF
997 // \UNNNNNNNN — 8 hex digits, full Unicode codepoint
998 'x' | 'u' | 'U' => {
999 let n = match escaped {
1000 'x' => 2,
1001 'u' => 4,
1002 _ => 8,
1003 };
1004 self.advance(); // consume the x/u/U
1005 let mut codepoint: u32 = 0;
1006 for _ in 0..n {
1007 let c = self.current_char.ok_or_else(|| {
1008 Error::scan(
1009 self.position,
1010 format!("Truncated \\{escaped} escape"),
1011 )
1012 })?;
1013 let d = c.to_digit(16).ok_or_else(|| {
1014 Error::scan(
1015 self.position,
1016 format!("Invalid hex digit `{c}` in \\{escaped} escape"),
1017 )
1018 })?;
1019 codepoint = (codepoint << 4) | d;
1020 self.advance();
1021 }
1022 let ch = char::from_u32(codepoint).ok_or_else(|| {
1023 Error::scan(
1024 self.position,
1025 format!("Invalid Unicode codepoint U+{codepoint:X}"),
1026 )
1027 })?;
1028 value.push(ch);
1029 escape_end = value.len();
1030 continue; // already advanced past hex digits
1031 }
1032 // Everything else is invalid per spec.
1033 _ => {
1034 return Err(Error::scan(
1035 self.position,
1036 format!("Invalid escape sequence: \\{escaped}"),
1037 ));
1038 }
1039 }
1040 escape_end = value.len();
1041 self.advance();
1042 }
1043 } else if ch == '\\' {
1044 // Single-quoted strings have no backslash escapes — `\` is
1045 // a literal character. (Single-quote escape is `''`.)
1046 value.push(ch);
1047 self.advance();
1048 } else if ch == '\n' || ch == '\r' {
1049 // YAML 1.2 §7.3.2 (double-quoted) / §7.3.3 (single-quoted)
1050 // line folding: a single newline within a quoted scalar
1051 // folds to a space; N>1 consecutive newlines retain N-1;
1052 // leading whitespace on the continuation line is excluded.
1053 let mut newlines = 0usize;
1054 // §6.1: tabs cannot be indentation. A continuation
1055 // line that BEGINS with a tab (no leading spaces) in
1056 // an enclosing block context is invalid (yaml-test-
1057 // suite DK95/01). Tabs that appear AFTER spaces in
1058 // the same indent area are content, not indentation.
1059 let mut just_after_newline = false;
1060 while let Some(c) = self.current_char {
1061 match c {
1062 '\n' => {
1063 newlines += 1;
1064 multi_line = true;
1065 self.advance();
1066 just_after_newline = true;
1067 }
1068 '\r' => {
1069 self.advance();
1070 }
1071 ' ' => {
1072 self.advance();
1073 just_after_newline = false;
1074 }
1075 '\t' if just_after_newline
1076 && self.flow_level == 0
1077 && (self.indent_stack.len() > 1
1078 || !self.compact_sequence_indents.is_empty()) =>
1079 {
1080 return Err(Error::scan(
1081 self.position,
1082 "Tab cannot serve as indentation of quoted scalar continuation"
1083 .to_string(),
1084 ));
1085 }
1086 '\t' => {
1087 self.advance();
1088 }
1089 _ => break,
1090 }
1091 }
1092 // §8.1.4: a multi-line quoted scalar inside a block
1093 // context must indent each continuation more than the
1094 // enclosing block. \`quoted: "a\\nb"\` with \`b\` at col 1
1095 // violates the rule because \`quoted:\` sits at indent 0
1096 // (yaml-test-suite QB6E). Only fires when there IS an
1097 // enclosing block (indent_stack > [0] or compact-seq
1098 // active) — top-level quoted scalars with continuation
1099 // at col 1 are legal.
1100 if newlines > 0
1101 && self.flow_level == 0
1102 && (self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty())
1103 && !matches!(self.current_char, None | Some('\n' | '\r'))
1104 {
1105 let parent_indent = self.indent_stack.last().copied().unwrap_or(0);
1106 let indent = self.position.column.saturating_sub(1);
1107 if indent <= parent_indent {
1108 return Err(Error::scan(
1109 self.position,
1110 "Quoted scalar continuation line is not indented enough".to_string(),
1111 ));
1112 }
1113 }
1114 // §6.8: a doc-start/end marker (`---` or `...`) at
1115 // column 1 always terminates the current document.
1116 // Encountering one inside an unterminated quoted
1117 // scalar is invalid — the quote escapes nothing past
1118 // the doc boundary (yaml-test-suite 5TRB, RXY3,
1119 // 9MQT/01).
1120 if self.position.column == 1 {
1121 let next3: String = self
1122 .char_cache
1123 .get(self.current_char_index..self.current_char_index + 3)
1124 .map(|s| s.iter().collect())
1125 .unwrap_or_default();
1126 if (next3 == "---" || next3 == "...")
1127 && self
1128 .char_cache
1129 .get(self.current_char_index + 3)
1130 .map_or(true, |c| c.is_whitespace())
1131 {
1132 return Err(Error::scan(
1133 self.position,
1134 format!(
1135 "Document {} marker `{}` inside quoted scalar",
1136 if next3 == "---" { "start" } else { "end" },
1137 next3
1138 ),
1139 ));
1140 }
1141 }
1142 // Drop trailing whitespace on the prior line (the bytes
1143 // we already pushed) before applying the fold. Don't
1144 // strip past `escape_end` — escape-produced whitespace
1145 // is literal content, not "trailing" line whitespace.
1146 while value.len() > escape_end && matches!(value.chars().last(), Some(' ' | '\t')) {
1147 value.pop();
1148 }
1149 if newlines <= 1 {
1150 value.push(' ');
1151 } else {
1152 for _ in 0..(newlines - 1) {
1153 value.push('\n');
1154 }
1155 }
1156 } else {
1157 value.push(ch);
1158 self.advance();
1159
1160 // Check string length periodically to fail fast
1161 if value.len() > self.limits.max_string_length {
1162 return Err(Error::limit_exceeded(format!(
1163 "String length {} exceeds maximum {}",
1164 value.len(),
1165 self.limits.max_string_length
1166 )));
1167 }
1168 }
1169 }
1170
1171 // Check string length limit
1172 if !closed {
1173 return Err(Error::scan(
1174 self.position,
1175 format!(
1176 "Unclosed {} quoted string",
1177 if quote_char == '"' {
1178 "double"
1179 } else {
1180 "single"
1181 }
1182 ),
1183 ));
1184 }
1185
1186 self.resource_tracker
1187 .check_string_length(&self.limits, value.len())?;
1188
1189 // YAML 1.2 §7.3.1 / §7.3.2: after the closing quote, the rest of
1190 // the line (or sub-expression in flow context) must be empty save
1191 // for a separator. Skip horizontal whitespace and look at the next
1192 // non-space char; if it's content rather than `,`/`:`/`}`/`]`/`#`/
1193 // newline/EOF, it's a trailing-content error (yaml-test-suite
1194 // Q4CL: `"quoted2" trailing content`).
1195 {
1196 let mut offset = 0isize;
1197 let mut saw_space = false;
1198 while matches!(self.peek_char(offset), Some(' ' | '\t')) {
1199 saw_space = true;
1200 offset += 1;
1201 }
1202 let next = self.peek_char(offset);
1203 // A `#` is a comment indicator ONLY when preceded by whitespace
1204 // (YAML 1.2 §6.6); `"value"#cmt` is invalid.
1205 let ok = match next {
1206 None => true,
1207 Some('#') => saw_space,
1208 Some(c) => matches!(c, ',' | ':' | '}' | ']' | '\n' | '\r'),
1209 };
1210 if !ok {
1211 return Err(Error::scan(
1212 self.position,
1213 format!("Unexpected `{}` after quoted scalar", next.unwrap_or(' ')),
1214 ));
1215 }
1216 // YAML 1.2 §8.1.3: implicit keys must be on a single line.
1217 // If the scalar folded across line breaks AND the next non-
1218 // whitespace char is `:` (key-value separator), the scalar
1219 // is being used as an implicit key — error.
1220 if multi_line && self.flow_level == 0 && next == Some(':') {
1221 return Err(Error::scan(
1222 self.position,
1223 "Multi-line quoted scalar may not be used as an implicit key".to_string(),
1224 ));
1225 }
1226 }
1227
1228 Ok(Token::new(
1229 TokenType::Scalar(value, quote_style),
1230 start_pos,
1231 self.position,
1232 ))
1233 }
1234
1235 /// Scan document start marker (---)
1236 fn scan_document_start(&mut self) -> Result<Option<Token>> {
1237 if self.current_char == Some('-')
1238 && self.peek_char(1) == Some('-')
1239 && self.peek_char(2) == Some('-')
1240 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1241 {
1242 // Doc markers are invalid inside flow collections.
1243 if self.flow_level > 0 {
1244 return Err(Error::scan(
1245 self.position,
1246 "`---` document-start marker is not allowed inside a flow collection"
1247 .to_string(),
1248 ));
1249 }
1250 let start_pos = self.position;
1251 self.advance(); // -
1252 self.advance(); // -
1253 self.advance(); // -
1254
1255 Ok(Some(Token::new(
1256 TokenType::DocumentStart,
1257 start_pos,
1258 self.position,
1259 )))
1260 } else {
1261 Ok(None)
1262 }
1263 }
1264
1265 /// Scan YAML version directive (%YAML)
1266 fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
1267 if self.current_char != Some('%') {
1268 return Ok(None);
1269 }
1270
1271 let start_pos = self.position;
1272 let saved_position = self.position;
1273 let saved_char = self.current_char;
1274 let saved_char_index = self.current_char_index;
1275 self.advance(); // Skip '%'
1276
1277 // Check for "YAML"
1278 if self.current_char == Some('Y')
1279 && self.peek_char(1) == Some('A')
1280 && self.peek_char(2) == Some('M')
1281 && self.peek_char(3) == Some('L')
1282 && self.peek_char(4).map_or(false, |c| c.is_whitespace())
1283 {
1284 self.advance(); // Y
1285 self.advance(); // A
1286 self.advance(); // M
1287 self.advance(); // L
1288
1289 // Skip whitespace
1290 self.skip_whitespace();
1291
1292 // Parse version number (e.g., "1.2")
1293 let major = if let Some(ch) = self.current_char {
1294 if ch.is_ascii_digit() {
1295 let digit = ch.to_digit(10).unwrap() as u8;
1296 self.advance();
1297 digit
1298 } else {
1299 return Err(Error::scan(
1300 self.position,
1301 "Expected major version number after %YAML".to_string(),
1302 ));
1303 }
1304 } else {
1305 return Err(Error::scan(
1306 self.position,
1307 "Expected version after %YAML directive".to_string(),
1308 ));
1309 };
1310
1311 // Expect '.'
1312 if self.current_char != Some('.') {
1313 return Err(Error::scan(
1314 self.position,
1315 "Expected '.' in YAML version".to_string(),
1316 ));
1317 }
1318 self.advance();
1319
1320 // Parse minor version
1321 let minor = if let Some(ch) = self.current_char {
1322 if ch.is_ascii_digit() {
1323 let digit = ch.to_digit(10).unwrap() as u8;
1324 self.advance();
1325 digit
1326 } else {
1327 return Err(Error::scan(
1328 self.position,
1329 "Expected minor version number after '.'".to_string(),
1330 ));
1331 }
1332 } else {
1333 return Err(Error::scan(
1334 self.position,
1335 "Expected minor version number".to_string(),
1336 ));
1337 };
1338
1339 // YAML 1.2 §6.8.1: the directive line must end after the
1340 // version (modulo whitespace and an optional comment). Extra
1341 // tokens (e.g. `%YAML 1.2 foo`) are invalid — yaml-test-suite
1342 // H7TQ. Also `%YAML 1.1#...` (yaml-test-suite MUS6/00) needs
1343 // whitespace before `#`.
1344 let mut saw_space = false;
1345 while matches!(self.current_char, Some(' ' | '\t')) {
1346 saw_space = true;
1347 self.advance();
1348 }
1349 match self.current_char {
1350 None | Some('\n' | '\r') => {}
1351 Some('#') if saw_space => {
1352 while let Some(ch) = self.current_char {
1353 if ch == '\n' || ch == '\r' {
1354 break;
1355 }
1356 self.advance();
1357 }
1358 }
1359 Some(c) => {
1360 return Err(Error::scan(
1361 self.position,
1362 format!("Unexpected `{c}` after %YAML directive"),
1363 ));
1364 }
1365 }
1366
1367 Ok(Some(Token::new(
1368 TokenType::YamlDirective(major, minor),
1369 start_pos,
1370 self.position,
1371 )))
1372 } else {
1373 // Not a YAML directive: restore the exact pre-`%` scanner state
1374 // in O(1). The previous code linear-scanned a char_indices side
1375 // table; saving the two cursor fields is both faster and lets
1376 // that table be dropped entirely (#26).
1377 self.position = saved_position;
1378 self.current_char = saved_char;
1379 self.current_char_index = saved_char_index;
1380 Ok(None)
1381 }
1382 }
1383
1384 /// Scan TAG directive (%TAG)
1385 fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
1386 if self.current_char != Some('%') {
1387 return Ok(None);
1388 }
1389
1390 let start_pos = self.position;
1391 let saved_position = self.position;
1392 let saved_char = self.current_char;
1393 let saved_char_index = self.current_char_index;
1394 self.advance(); // Skip '%'
1395
1396 // Check for "TAG"
1397 if self.current_char == Some('T')
1398 && self.peek_char(1) == Some('A')
1399 && self.peek_char(2) == Some('G')
1400 && self.peek_char(3).map_or(false, |c| c.is_whitespace())
1401 {
1402 self.advance(); // T
1403 self.advance(); // A
1404 self.advance(); // G
1405
1406 // Skip whitespace
1407 self.skip_whitespace();
1408
1409 // Parse handle (e.g., "!" or "!!")
1410 let handle = self.scan_tag_handle()?;
1411
1412 // Skip whitespace
1413 self.skip_whitespace();
1414
1415 // Parse prefix (URI)
1416 let prefix = self.scan_tag_prefix()?;
1417
1418 Ok(Some(Token::new(
1419 TokenType::TagDirective(handle, prefix),
1420 start_pos,
1421 self.position,
1422 )))
1423 } else {
1424 // Not a TAG directive: restore the exact pre-`%` scanner state
1425 // in O(1) (see the matching note in scan_yaml_directive, #26).
1426 self.position = saved_position;
1427 self.current_char = saved_char;
1428 self.current_char_index = saved_char_index;
1429 Ok(None)
1430 }
1431 }
1432
1433 /// Scan a tag handle for TAG directive
1434 fn scan_tag_handle(&mut self) -> Result<String> {
1435 let mut handle = String::new();
1436
1437 if self.current_char != Some('!') {
1438 return Err(Error::scan(
1439 self.position,
1440 "Expected '!' at start of tag handle".to_string(),
1441 ));
1442 }
1443
1444 handle.push('!');
1445 self.advance();
1446
1447 // Handle can be "!" or "!!" or "!name!"
1448 if self.current_char == Some('!') {
1449 // Secondary handle "!!"
1450 handle.push('!');
1451 self.advance();
1452 } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
1453 // Named handle like "!name!"
1454 while let Some(ch) = self.current_char {
1455 if ch.is_alphanumeric() || ch == '-' || ch == '_' {
1456 handle.push(ch);
1457 self.advance();
1458 } else if ch == '!' {
1459 handle.push(ch);
1460 self.advance();
1461 break;
1462 } else {
1463 break;
1464 }
1465 }
1466 }
1467 // else just "!" primary handle
1468
1469 Ok(handle)
1470 }
1471
1472 /// Scan a tag prefix (URI) for TAG directive
1473 fn scan_tag_prefix(&mut self) -> Result<String> {
1474 let mut prefix = String::new();
1475
1476 // Read until end of line or comment
1477 while let Some(ch) = self.current_char {
1478 if ch == '\n' || ch == '\r' || ch == '#' {
1479 break;
1480 }
1481 if ch.is_whitespace() && prefix.is_empty() {
1482 self.advance();
1483 continue;
1484 }
1485 if ch.is_whitespace() && !prefix.is_empty() {
1486 // Trailing whitespace, we're done
1487 break;
1488 }
1489 prefix.push(ch);
1490 self.advance();
1491 }
1492
1493 if prefix.is_empty() {
1494 return Err(Error::scan(
1495 self.position,
1496 "Expected tag prefix after tag handle".to_string(),
1497 ));
1498 }
1499
1500 Ok(prefix.trim().to_string())
1501 }
1502
1503 /// Check if current position might be a directive
1504 fn is_directive(&self) -> bool {
1505 self.current_char == Some('%') && self.position.column == 1
1506 }
1507
1508 /// Scan document end marker (...)
1509 fn scan_document_end(&mut self) -> Result<Option<Token>> {
1510 if self.current_char == Some('.')
1511 && self.peek_char(1) == Some('.')
1512 && self.peek_char(2) == Some('.')
1513 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
1514 {
1515 // Doc markers are invalid inside flow collections.
1516 if self.flow_level > 0 {
1517 return Err(Error::scan(
1518 self.position,
1519 "`...` document-end marker is not allowed inside a flow collection".to_string(),
1520 ));
1521 }
1522 let start_pos = self.position;
1523 self.advance(); // .
1524 self.advance(); // .
1525 self.advance(); // .
1526
1527 // YAML 1.2 §6.4: `...` must be followed only by whitespace or
1528 // end-of-line (comments allowed). Inline content after `...`
1529 // is invalid (yaml-test-suite 3HFZ).
1530 while let Some(ch) = self.current_char {
1531 match ch {
1532 ' ' | '\t' => {
1533 self.advance();
1534 }
1535 '\n' | '\r' | '#' => break,
1536 _ => {
1537 return Err(Error::scan(
1538 self.position,
1539 "Content after `...` document-end marker is invalid".to_string(),
1540 ));
1541 }
1542 }
1543 }
1544
1545 Ok(Some(Token::new(
1546 TokenType::DocumentEnd,
1547 start_pos,
1548 self.position,
1549 )))
1550 } else {
1551 Ok(None)
1552 }
1553 }
1554
1555 /// Scan a comment token
1556 fn scan_comment(&mut self) -> Result<Token> {
1557 let start_pos = self.position;
1558 let mut comment_text = String::new();
1559
1560 // Skip the '#' character
1561 if self.current_char == Some('#') {
1562 self.advance();
1563 }
1564
1565 // Collect the comment text
1566 while let Some(ch) = self.current_char {
1567 if ch == '\n' || ch == '\r' {
1568 break;
1569 }
1570 comment_text.push(ch);
1571 self.advance();
1572 }
1573
1574 // Trim leading whitespace from comment text
1575 let comment_text = comment_text.trim_start().to_string();
1576
1577 Ok(Token::new(
1578 TokenType::Comment(comment_text),
1579 start_pos,
1580 self.position,
1581 ))
1582 }
1583
1584 /// Process a line and generate appropriate tokens
1585 #[allow(clippy::cognitive_complexity)]
1586 fn process_line(&mut self) -> Result<()> {
1587 // Check for directives at start of line
1588 if self.position.column == 1 && self.current_char == Some('%') {
1589 // Try to scan YAML directive
1590 if let Some(token) = self.scan_yaml_directive()? {
1591 self.tokens.push(token);
1592 return Ok(());
1593 }
1594
1595 // Try to scan TAG directive
1596 if let Some(token) = self.scan_tag_directive()? {
1597 self.tokens.push(token);
1598 return Ok(());
1599 }
1600
1601 // YAML 1.2 §6.8.4: a YAML processor MUST ignore directives it
1602 // does not recognize. Skip the line silently — parsing continues
1603 // with whatever follows on the next line.
1604 if self.current_char == Some('%') {
1605 while let Some(ch) = self.current_char {
1606 if ch == '\n' || ch == '\r' {
1607 break;
1608 }
1609 self.advance();
1610 }
1611 return Ok(());
1612 }
1613 }
1614
1615 // Check for document markers at start of line
1616 if self.position.column == 1 {
1617 // Check for document start marker
1618 if let Some(token) = self.scan_document_start()? {
1619 self.tokens.push(token);
1620 return Ok(());
1621 }
1622
1623 // Check for document end marker
1624 if let Some(token) = self.scan_document_end()? {
1625 self.tokens.push(token);
1626 return Ok(());
1627 }
1628 }
1629
1630 // Handle indentation at start of line
1631 if self.position.column == 1 {
1632 self.handle_indentation()?;
1633 }
1634
1635 // Skip empty lines and comments
1636 self.skip_whitespace();
1637
1638 match self.current_char {
1639 None => return Ok(()),
1640 Some('#') => {
1641 if self.preserve_comments {
1642 // Create a comment token
1643 let comment_token = self.scan_comment()?;
1644 self.tokens.push(comment_token);
1645 } else {
1646 // Skip comment lines
1647 while let Some(ch) = self.current_char {
1648 if ch == '\n' || ch == '\r' {
1649 break;
1650 }
1651 self.advance();
1652 }
1653 }
1654 return Ok(());
1655 }
1656 Some('\n' | '\r') => {
1657 self.advance();
1658 return Ok(());
1659 }
1660 _ => {}
1661 }
1662
1663 // Process tokens on this line
1664 while let Some(ch) = self.current_char {
1665 match ch {
1666 '\n' | '\r' => break,
1667 ' ' | '\t' => {
1668 self.skip_whitespace();
1669 }
1670 '#' => {
1671 // YAML 1.2 §6.6: a comment must be preceded by whitespace
1672 // OR be at the start of a line. Inputs like `,#invalid`
1673 // (yaml-test-suite CVW2) are not valid comments.
1674 let prev = self.peek_char(-1);
1675 let at_line_start = self.position.column == 1;
1676 let preceded_by_space = prev.map_or(true, |c| c.is_whitespace());
1677 if !at_line_start && !preceded_by_space {
1678 return Err(Error::scan(
1679 self.position,
1680 "Comment `#` must be preceded by whitespace".to_string(),
1681 ));
1682 }
1683 if self.preserve_comments {
1684 let comment_token = self.scan_comment()?;
1685 self.tokens.push(comment_token);
1686 } else {
1687 while let Some(ch) = self.current_char {
1688 if ch == '\n' || ch == '\r' {
1689 break;
1690 }
1691 self.advance();
1692 }
1693 }
1694 break;
1695 }
1696
1697 // Flow indicators. §7.4 allows a flow collection as
1698 // the implicit key of a block mapping (`[a]: b`,
1699 // `{x: y}: z`). When the flow-open is at line-start
1700 // (block context) and a `:` follows on the same line,
1701 // open the wrapping block mapping at the column of the
1702 // flow-open token, just as we do for line-start
1703 // properties (yaml-test-suite LX3P, 4FJ6, M2N8/01).
1704 '[' => {
1705 if self.flow_level == 0
1706 && self.position.column == self.current_indent + 1
1707 && self.check_for_mapping_ahead()
1708 {
1709 self.maybe_open_block_mapping_for_key()?;
1710 }
1711 let pos = self.position;
1712 self.advance();
1713 self.flow_level += 1;
1714 // Check depth limit
1715 self.resource_tracker
1716 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1717 self.tokens
1718 .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1719 }
1720 ']' => {
1721 // YAML 1.2 §7.4: `]` is only valid inside an open
1722 // flow sequence. Stray `]` is a syntax error
1723 // (yaml-test-suite 4H7K).
1724 if self.flow_level == 0 {
1725 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1726 .with_suggestion(
1727 "Remove the extra `]` or open a flow sequence with `[` first"
1728 .to_string(),
1729 );
1730 return Err(Error::scan_with_context(
1731 self.position,
1732 "Unexpected `]` outside flow context",
1733 context,
1734 ));
1735 }
1736 let pos = self.position;
1737 self.advance();
1738 self.flow_level -= 1;
1739 self.tokens
1740 .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1741 }
1742 '{' => {
1743 if self.flow_level == 0
1744 && self.position.column == self.current_indent + 1
1745 && self.check_for_mapping_ahead()
1746 {
1747 self.maybe_open_block_mapping_for_key()?;
1748 }
1749 let pos = self.position;
1750 self.advance();
1751 self.flow_level += 1;
1752 // Check depth limit
1753 self.resource_tracker
1754 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1755 self.tokens
1756 .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1757 }
1758 '}' => {
1759 if self.flow_level == 0 {
1760 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1761 .with_suggestion(
1762 "Remove the extra `}` or open a flow mapping with `{` first"
1763 .to_string(),
1764 );
1765 return Err(Error::scan_with_context(
1766 self.position,
1767 "Unexpected `}` outside flow context",
1768 context,
1769 ));
1770 }
1771 let pos = self.position;
1772 self.advance();
1773 self.flow_level -= 1;
1774 self.tokens
1775 .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1776 }
1777 ',' => {
1778 // §7.4: \`,\` is a flow indicator. Outside flow
1779 // context it's not meaningful as a structural
1780 // separator (yaml-test-suite U99R: \`- !!str, xxx\`
1781 // — the comma after a tag in block context is
1782 // invalid).
1783 if self.flow_level == 0 {
1784 return Err(Error::scan(
1785 self.position,
1786 "Unexpected `,` outside flow context".to_string(),
1787 ));
1788 }
1789 let pos = self.position;
1790 self.advance();
1791 self.tokens
1792 .push(Token::new(TokenType::FlowEntry, pos, self.position));
1793 }
1794
1795 // Key-value separator. YAML 1.2 §7.3.3 / §7.4:
1796 // * Block context: `:` separates key from value only when
1797 // followed by whitespace / EOF — otherwise it's part of
1798 // a plain scalar (e.g. `:foo`, `URL://path`).
1799 // * Flow context: same, plus `:` may be adjacent to a
1800 // value when the previous token completed a key node
1801 // (quoted/plain scalar, alias, or closed flow
1802 // collection) — see yaml-test-suite 5MUD, 5T43.
1803 ':' if self.peek_char(1).map_or(true, |c| {
1804 c.is_whitespace() || (self.flow_level > 0 && ",[]{}".contains(c))
1805 }) || (self.flow_level > 0
1806 && matches!(
1807 self.tokens.last().map(|t| &t.token_type),
1808 Some(
1809 TokenType::Scalar(_, _)
1810 | TokenType::Alias(_)
1811 | TokenType::FlowMappingEnd
1812 | TokenType::FlowSequenceEnd
1813 )
1814 )) =>
1815 {
1816 // §6.2: a \`:\` at line-start (the explicit-value
1817 // counterpart of an explicit \`?\` key) must be
1818 // followed by a SPACE — a tab as separator is
1819 // invalid (yaml-test-suite Y79Y/007, /009).
1820 if self.flow_level == 0
1821 && self.position.column == self.current_indent + 1
1822 && self.peek_char(1) == Some('\t')
1823 {
1824 return Err(Error::scan(
1825 self.position,
1826 "Tab cannot follow line-start `:` as explicit-value separator"
1827 .to_string(),
1828 ));
1829 }
1830 // §8.22: an implicit key in block context must fit
1831 // on a single line. If the previous token is a
1832 // flow-collection close whose matching open is on
1833 // a different line, the flow node spans multiple
1834 // lines and can't serve as the key (yaml-test-
1835 // suite C2SP \`[23\\n]: 42\`).
1836 if self.flow_level == 0 {
1837 let mut is_flow_close = false;
1838 let mut close_end_line = 0;
1839 if let Some(last) = self.tokens.last() {
1840 if matches!(
1841 last.token_type,
1842 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd
1843 ) {
1844 is_flow_close = true;
1845 close_end_line = last.end_position.line;
1846 }
1847 }
1848 if is_flow_close {
1849 let mut depth = 0i32;
1850 let mut open_idx: Option<usize> = None;
1851 for (idx, t) in self.tokens.iter().enumerate().rev() {
1852 match &t.token_type {
1853 TokenType::FlowSequenceEnd | TokenType::FlowMappingEnd => {
1854 depth += 1;
1855 }
1856 TokenType::FlowSequenceStart | TokenType::FlowMappingStart => {
1857 depth -= 1;
1858 if depth == 0 {
1859 open_idx = Some(idx);
1860 break;
1861 }
1862 }
1863 _ => {}
1864 }
1865 }
1866 if let Some(oi) = open_idx {
1867 let open_line = self.tokens[oi].start_position.line;
1868 // If a `?` (Key) token precedes the
1869 // matching flow open on the same line
1870 // as the key, the key is explicit and
1871 // may span lines (yaml-test-suite M5DY
1872 // \`? [ ...spans... ]: [ ... ]\`).
1873 let key_marker_before = self.tokens[..oi].iter().rev().any(|t| {
1874 matches!(t.token_type, TokenType::Key)
1875 && t.start_position.line == open_line
1876 });
1877 if !key_marker_before && open_line != close_end_line {
1878 return Err(Error::scan(
1879 self.position,
1880 "Implicit key in block context: flow collection key spans multiple lines"
1881 .to_string(),
1882 ));
1883 }
1884 }
1885 }
1886 }
1887 let pos = self.position;
1888 self.advance();
1889 self.tokens
1890 .push(Token::new(TokenType::Value, pos, self.position));
1891 }
1892
1893 // §6.2: the explicit-key marker \`?\` must be followed
1894 // by a SPACE (or EOL), not a tab. Tab as separator
1895 // after \`?\` is invalid (yaml-test-suite Y79Y/006, /008).
1896 '?' if self.flow_level == 0 && self.peek_char(1) == Some('\t') => {
1897 return Err(Error::scan(
1898 self.position,
1899 "Tab cannot follow `?` as block-key separator".to_string(),
1900 ));
1901 }
1902
1903 // Explicit key marker. An indented `?` at line-start
1904 // (e.g. `mapping:\\n ? key`) opens an implicit block
1905 // mapping at this column — same as a line-start scalar
1906 // key. Without this, scan_plain_scalar wouldn't see
1907 // the inner mapping's indent and would wrongly fold
1908 // the key content into a multi-line scalar
1909 // (yaml-test-suite S9E8, KK5P).
1910 '?' if self.flow_level == 0
1911 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1912 || self.peek_char(1).is_none()) =>
1913 {
1914 if self.position.column == self.current_indent + 1 {
1915 self.maybe_open_block_mapping_for_key()?;
1916 }
1917 let pos = self.position;
1918 self.advance();
1919 self.tokens
1920 .push(Token::new(TokenType::Key, pos, self.position));
1921 }
1922 '?' if self.flow_level > 0
1923 && (self
1924 .peek_char(1)
1925 .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1926 || self.peek_char(1).is_none()) =>
1927 {
1928 let pos = self.position;
1929 self.advance();
1930 self.tokens
1931 .push(Token::new(TokenType::Key, pos, self.position));
1932 }
1933
1934 // Block entry
1935 '-' if self.flow_level == 0
1936 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1937 || self.peek_char(1).is_none()) =>
1938 {
1939 // A block-entry \`-\` immediately after a flow
1940 // collection's close (\`}\`, \`]\`) ON THE SAME LINE
1941 // is invalid — no separator between the closed
1942 // flow node and the next sibling (yaml-test-suite
1943 // P2EQ \`- { y: z }- invalid\`). The same-line guard
1944 // is essential — a \`}\` on a previous line with a
1945 // new \`-\` on the next line is perfectly valid.
1946 //
1947 // Likewise, a block-entry \`-\` immediately after a
1948 // property (Anchor / Tag) on the same line is
1949 // invalid — the property must precede a node, and
1950 // a block sequence's first \`-\` must begin a line
1951 // (yaml-test-suite SY6V \`&anchor - x\`).
1952 if let Some(last) = self.tokens.last() {
1953 if matches!(
1954 last.token_type,
1955 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
1956 ) && last.end_position.line == self.position.line
1957 {
1958 return Err(Error::scan(
1959 self.position,
1960 "Block-entry `-` immediately after flow collection close"
1961 .to_string(),
1962 ));
1963 }
1964 if matches!(last.token_type, TokenType::Anchor(_) | TokenType::Tag(_))
1965 && last.end_position.line == self.position.line
1966 {
1967 return Err(Error::scan(
1968 self.position,
1969 "Block-entry `-` cannot follow a property on the same line"
1970 .to_string(),
1971 ));
1972 }
1973 // §8.22: a block sequence's first \`-\` must
1974 // begin on a new line. \`key: - a\` (implicit
1975 // key, then dash on same line) is invalid
1976 // (yaml-test-suite 5U3A). But \`? key\\n: - x\`
1977 // (explicit value-separator on the same line
1978 // as the dash) IS valid: the \`?\` key sits
1979 // on a previous line. We distinguish by
1980 // walking back from the Value: if the
1981 // preceding non-property token is a Scalar
1982 // on the same line as the Value, the key
1983 // is implicit; otherwise it's after \`?\`.
1984 if matches!(last.token_type, TokenType::Value)
1985 && last.end_position.line == self.position.line
1986 {
1987 let value_line = last.start_position.line;
1988 let mut prior_scalar_line = None;
1989 for t in self.tokens.iter().rev().skip(1) {
1990 match &t.token_type {
1991 TokenType::Anchor(_) | TokenType::Tag(_) => {}
1992 TokenType::Scalar(..) => {
1993 prior_scalar_line = Some(t.end_position.line);
1994 break;
1995 }
1996 _ => break,
1997 }
1998 }
1999 if prior_scalar_line == Some(value_line) {
2000 return Err(Error::scan(
2001 self.position,
2002 "Block sequence value cannot start on the same line as its key"
2003 .to_string(),
2004 ));
2005 }
2006 }
2007 }
2008 let pos = self.position;
2009 self.advance();
2010
2011 // Check if we need to start a new block sequence.
2012 // `unwrap_or(0)` mirrors the pattern in
2013 // src/scanner/indentation.rs and is safer than
2014 // `.unwrap()` here: an error-recovery pop in another
2015 // path could otherwise leave the stack empty and
2016 // panic on crafted input (#18).
2017 let last_indent = self.indent_stack.last().copied().unwrap_or(0);
2018
2019 // If a compact sequence (opened from `? - x` or
2020 // similar) is already active at this dash's column,
2021 // the dash continues it — don't open a new nested
2022 // block sequence (yaml-test-suite M5DY).
2023 let dash_indent = pos.column.saturating_sub(1);
2024 let compact_active_here = self
2025 .compact_sequence_indents
2026 .last()
2027 .map_or(false, |&si| si == dash_indent);
2028 if compact_active_here {
2029 // Continuation of an existing compact sequence.
2030 } else if self.current_indent > last_indent {
2031 // Deeper indentation - start new nested sequence
2032 self.indent_stack.push(self.current_indent);
2033 self.indent_is_sequence.push(true);
2034 // Check depth limit
2035 self.resource_tracker
2036 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2037 self.tokens
2038 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2039 } else if self.current_indent == last_indent
2040 && *self.indent_is_sequence.last().unwrap_or(&false)
2041 {
2042 // Same indent and the top of stack is already a sequence
2043 // → continuation of that sequence; no new start needed.
2044 } else if self.current_indent >= last_indent {
2045 // Same or root level — compact notation.
2046 // Start a new sequence only if we don't already have one
2047 // tracked at this exact indent.
2048 // For a dash that's *not* at line-start (e.g.
2049 // `? - x` where current_indent is still the
2050 // line's indent but the dash sits in mid-line),
2051 // use the dash column - 1 as the sequence's
2052 // indent so scan_plain_scalar's continuation
2053 // check correctly sees the deeper context
2054 // (yaml-test-suite M5DY).
2055 let dash_indent = pos.column.saturating_sub(1);
2056 let seq_indent = dash_indent.max(self.current_indent);
2057 let has_active_compact = self
2058 .compact_sequence_indents
2059 .last()
2060 .map_or(false, |&si| si == seq_indent);
2061
2062 if !has_active_compact {
2063 self.compact_sequence_indents.push(seq_indent);
2064 // Check depth limit
2065 self.resource_tracker.check_depth(
2066 &self.limits,
2067 self.flow_level + self.indent_stack.len(),
2068 )?;
2069 self.tokens
2070 .push(Token::simple(TokenType::BlockSequenceStart, pos));
2071 }
2072 }
2073
2074 self.tokens
2075 .push(Token::new(TokenType::BlockEntry, pos, self.position));
2076
2077 // After emitting BlockEntry, check if the next
2078 // token is another dash (nested sequence). §6.2
2079 // requires SPACE separation between dashes — a
2080 // tab between the outer and inner \`-\` is invalid
2081 // (yaml-test-suite Y79Y/004, /005). Track whether
2082 // a tab was consumed while skipping the inter-
2083 // dash whitespace and reject if so.
2084 let mut saw_tab_between = false;
2085 while let Some(c) = self.current_char {
2086 if c == ' ' {
2087 self.advance();
2088 } else if c == '\t' {
2089 saw_tab_between = true;
2090 self.advance();
2091 } else {
2092 break;
2093 }
2094 }
2095 if self.current_char == Some('-')
2096 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2097 && saw_tab_between
2098 {
2099 return Err(Error::scan(
2100 self.position,
2101 "Tab between block-entries on same line".to_string(),
2102 ));
2103 }
2104 if self.current_char == Some('-')
2105 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
2106 {
2107 // We have a nested sequence on the same line!
2108 // Track this as an inline sequence
2109 self.inline_sequence_depth += 1;
2110 // Push the *indent* (column - 1), not the
2111 // column, so it matches the convention used by
2112 // maybe_open_block_mapping_for_key. With column
2113 // here the next-line indent (column - 1) would
2114 // be strictly less than the stored value and
2115 // wrongly trigger an early close, breaking
2116 // multi-line nested sequences (yaml-test-suite
2117 // 3ALJ, 57H4).
2118 self.indent_stack
2119 .push(self.position.column.saturating_sub(1));
2120 self.indent_is_sequence.push(true);
2121 // Check depth limit
2122 self.resource_tracker
2123 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
2124 self.tokens
2125 .push(Token::simple(TokenType::BlockSequenceStart, self.position));
2126 // Continue processing - the next iteration will handle the nested dash
2127 } else if self.current_char.is_some()
2128 && !matches!(self.current_char, Some('\n' | '\r'))
2129 {
2130 // Content follows "- " on the same line.
2131 // Update current_indent to the content's column position so that
2132 // any mapping started here will be at a deeper indent level than
2133 // the sequence. This ensures handle_indentation properly closes
2134 // the mapping when the next sibling "- " appears.
2135 self.current_indent = self.position.column - 1;
2136 }
2137 }
2138
2139 // Quoted strings — same implicit-key mapping detection
2140 // as for plain scalars (yaml-test-suite 6H3V, 6SLA).
2141 '"' | '\'' => {
2142 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2143 self.maybe_open_block_mapping_for_key()?;
2144 }
2145 let token = self.scan_quoted_string(ch)?;
2146 self.tokens.push(token);
2147 }
2148
2149 // Document markers (only if not a block entry).
2150 //
2151 // Reached only when `-` is at column = current_indent + 1 AND
2152 // the next character is non-whitespace — i.e. either the
2153 // `---` document-start marker OR a plain scalar starting
2154 // with `-` (e.g. `---word1`, `-foo`). If `scan_document_start`
2155 // declines, we MUST consume the run as a plain scalar — not
2156 // consulting `is_plain_scalar_start` here, because that helper
2157 // unconditionally rejects `-`, which would leave the outer
2158 // `while let` loop spinning on the same character.
2159 '-' if self.position.column == self.current_indent + 1
2160 && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
2161 {
2162 if let Some(token) = self.scan_document_start()? {
2163 self.tokens.push(token);
2164 } else {
2165 let token = self.scan_plain_scalar()?;
2166 self.tokens.push(token);
2167 }
2168 }
2169 '.' if self.position.column == self.current_indent + 1 => {
2170 if let Some(token) = self.scan_document_end()? {
2171 self.tokens.push(token);
2172 } else if self.is_plain_scalar_start() {
2173 let token = self.scan_plain_scalar()?;
2174 self.tokens.push(token);
2175 }
2176 }
2177
2178 // Numbers or plain scalars starting with -
2179 // Only scan as number if the entire token is numeric (no trailing letters)
2180 _ if (ch.is_ascii_digit()
2181 || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())))
2182 && self.is_pure_number() =>
2183 {
2184 // A numeric scalar can be an implicit mapping key just
2185 // like any other scalar. Open the block mapping before
2186 // the key token so `BlockMappingStart` is emitted —
2187 // every other scalar dispatch arm does this; the number
2188 // arm previously skipped it, so `421: null` parsed as a
2189 // bare scalar instead of a mapping (#66).
2190 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2191 self.maybe_open_block_mapping_for_key()?;
2192 }
2193 let token = self.scan_number()?;
2194 self.tokens.push(token);
2195 }
2196
2197 // Anchors and aliases. §6.9: a node's properties
2198 // (anchor/tag) are prefixes of the node. When an `&`,
2199 // `*`, or `!` is at the start of a line (column ==
2200 // current_indent + 1) and a `: ` follows on the same
2201 // line, the property/alias is part of an implicit
2202 // key's leading position. The block mapping that
2203 // contains this key therefore opens at this column,
2204 // *before* the property/alias token is emitted
2205 // (yaml-test-suite 7BMT, 6BFJ, 9KAX, U3XV, 26DV).
2206 '&' => {
2207 // Mirror H7J7 check for anchors (yaml-test-suite
2208 // G9HC \`seq:\\n&anchor\\n- a\`).
2209 if self.flow_level == 0
2210 && self.position.column == self.current_indent + 1
2211 && !self.check_for_mapping_ahead()
2212 && self.indent_stack.len() > 1
2213 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2214 && self.most_recent_token_is_value_separator()
2215 {
2216 return Err(Error::scan(
2217 self.position,
2218 "Anchor at line-start with insufficient indent for value position"
2219 .to_string(),
2220 ));
2221 }
2222 if self.flow_level == 0
2223 && self.position.column == self.current_indent + 1
2224 && self.check_for_mapping_ahead()
2225 {
2226 self.maybe_open_block_mapping_for_key()?;
2227 }
2228 let token = self.scan_anchor()?;
2229 self.tokens.push(token);
2230 }
2231 '*' => {
2232 // §6.9.2: alias/anchor names may contain \`:\` (only
2233 // flow indicators and whitespace terminate them).
2234 // So \`*a:\` is an alias named \`a:\`, NOT an alias
2235 // \`*a\` followed by a key separator. Don't open
2236 // an implicit block mapping in that case (yaml-
2237 // test-suite 2SXE).
2238 if self.flow_level == 0
2239 && self.position.column == self.current_indent + 1
2240 && self.check_for_mapping_ahead()
2241 && !self.colon_belongs_to_alias_anchor_name()
2242 {
2243 self.maybe_open_block_mapping_for_key()?;
2244 }
2245 let token = self.scan_alias()?;
2246 self.tokens.push(token);
2247 }
2248
2249 // Block scalars
2250 '|' => {
2251 let token = self.scan_literal_block_scalar()?;
2252 self.tokens.push(token);
2253 // Block scalar collection rewinds the cursor to the
2254 // start of the next under-indented line. `current_indent`
2255 // is still set to the inline content's column from the
2256 // enclosing `- |` / `key: |` site, so the next iteration
2257 // would mis-dispatch. Break out so the outer loop
2258 // re-enters `process_line` and reruns indent handling
2259 // (yaml-test-suite 4QFQ, M6YH, P2AD).
2260 break;
2261 }
2262 '>' => {
2263 let token = self.scan_folded_block_scalar()?;
2264 self.tokens.push(token);
2265 break;
2266 }
2267
2268 // Tags. Same line-start property-opens-mapping rule
2269 // (yaml-test-suite ZH7C variants).
2270 //
2271 // §6.9: a property at the SAME indent as the
2272 // enclosing mapping/sequence cannot apply to that
2273 // collection's value — the value must be more
2274 // indented. If we're at a line-start \`!\` whose column
2275 // equals the enclosing mapping's indent + 1 AND that
2276 // mapping currently has a key awaiting a value, the
2277 // tag is misplaced (yaml-test-suite H7J7).
2278 '!' => {
2279 if self.flow_level == 0
2280 && self.position.column == self.current_indent + 1
2281 && !self.check_for_mapping_ahead()
2282 && self.indent_stack.len() > 1
2283 && self.current_indent == self.indent_stack[self.indent_stack.len() - 2]
2284 && self.most_recent_token_is_value_separator()
2285 {
2286 return Err(Error::scan(
2287 self.position,
2288 "Tag at line-start with insufficient indent for value position"
2289 .to_string(),
2290 ));
2291 }
2292 if self.flow_level == 0
2293 && self.position.column == self.current_indent + 1
2294 && self.check_for_mapping_ahead()
2295 {
2296 self.maybe_open_block_mapping_for_key()?;
2297 }
2298 let token = self.scan_tag()?;
2299 self.tokens.push(token);
2300 }
2301
2302 // Plain scalars
2303 _ if self.is_plain_scalar_start() => {
2304 // A plain scalar starting on the SAME line as a
2305 // flow-collection close (\`}\` or \`]\`) means there's
2306 // no separator between the closed flow node and
2307 // the new content (yaml-test-suite 62EZ
2308 // \`x: { y: z }in: valid\`).
2309 if self.flow_level == 0 {
2310 if let Some(last) = self.tokens.last() {
2311 if matches!(
2312 last.token_type,
2313 TokenType::FlowMappingEnd | TokenType::FlowSequenceEnd
2314 ) && last.end_position.line == self.position.line
2315 {
2316 return Err(Error::scan(
2317 self.position,
2318 "Plain scalar immediately after flow collection close"
2319 .to_string(),
2320 ));
2321 }
2322 }
2323 }
2324 if self.flow_level == 0 && self.check_for_mapping_ahead() {
2325 self.maybe_open_block_mapping_for_key()?;
2326 }
2327
2328 let token = self.scan_plain_scalar()?;
2329 self.tokens.push(token);
2330 }
2331
2332 _ => {
2333 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2334 .with_suggestion("Check for valid YAML syntax characters".to_string());
2335 return Err(Error::invalid_character_with_context(
2336 self.position,
2337 ch,
2338 "YAML document",
2339 context,
2340 ));
2341 }
2342 }
2343 }
2344
2345 // Inline sequences (nested \`- -\` on one line) used to be
2346 // closed unconditionally at end-of-line. But a nested sequence
2347 // can span lines (`- - a\n - b\n- c`) — in that case the inner
2348 // sequence must remain open until handle_indentation sees a
2349 // dedent. Reset the inline-sequence counter (so the next line
2350 // is judged on its own merits) but DO NOT emit BlockEnd —
2351 // handle_indentation's indent_stack pop, the end-of-stream
2352 // close at scan_next_token, and the explicit-dedent close at
2353 // handle_indentation's bottom each provide a correct close.
2354 self.inline_sequence_depth = 0;
2355
2356 Ok(())
2357 }
2358
2359 /// Scan the next token lazily
2360 fn scan_next_token(&mut self) -> Result<()> {
2361 if self.done {
2362 return Ok(());
2363 }
2364
2365 // Add stream start token if this is the beginning
2366 if self.tokens.is_empty() {
2367 self.tokens
2368 .push(Token::simple(TokenType::StreamStart, self.position));
2369 return Ok(());
2370 }
2371
2372 // Check if we're at the end of input
2373 if self.current_char.is_none() {
2374 if !self
2375 .tokens
2376 .iter()
2377 .any(|t| matches!(t.token_type, TokenType::StreamEnd))
2378 {
2379 self.tokens
2380 .push(Token::simple(TokenType::StreamEnd, self.position));
2381 }
2382 self.done = true;
2383 return Ok(());
2384 }
2385
2386 // For now, fall back to scanning all tokens at once for the lazy scanner
2387 // This is a simplified implementation - a full streaming parser would
2388 // need more sophisticated state management
2389 let tokens_before = self.tokens.len();
2390 self.scan_all_tokens()?;
2391
2392 // Mark as done after scanning all tokens
2393 if self.tokens.len() == tokens_before {
2394 self.done = true;
2395 }
2396
2397 Ok(())
2398 }
2399
2400 /// Pre-scan all tokens (simplified approach for basic implementation)
2401 fn scan_all_tokens(&mut self) -> Result<()> {
2402 // Only add StreamStart if we don't have it yet
2403 if !self
2404 .tokens
2405 .iter()
2406 .any(|t| matches!(t.token_type, TokenType::StreamStart))
2407 {
2408 self.tokens
2409 .push(Token::simple(TokenType::StreamStart, self.position));
2410 }
2411
2412 while self.current_char.is_some() {
2413 self.process_line()?;
2414
2415 // Advance past newlines
2416 while let Some(ch) = self.current_char {
2417 if ch == '\n' || ch == '\r' {
2418 self.advance();
2419 } else {
2420 break;
2421 }
2422 }
2423 }
2424
2425 // Close any remaining compact sequences (before their parent mappings)
2426 while self.compact_sequence_indents.pop().is_some() {
2427 self.tokens
2428 .push(Token::simple(TokenType::BlockEnd, self.position));
2429 }
2430
2431 // Close any remaining blocks
2432 while self.indent_stack.len() > 1 {
2433 self.indent_stack.pop();
2434 self.indent_is_sequence.pop();
2435 self.tokens
2436 .push(Token::simple(TokenType::BlockEnd, self.position));
2437 }
2438
2439 self.tokens
2440 .push(Token::simple(TokenType::StreamEnd, self.position));
2441 self.done = true;
2442 Ok(())
2443 }
2444
2445 /// Peek at a character at the given offset (can be negative)
2446 /// Check if the current position starts a pure number (digits/dots/minus only,
2447 /// not followed by letters). Values like 500m, 128Mi should be treated as plain scalars.
2448 fn is_pure_number(&self) -> bool {
2449 let mut offset: isize = 0;
2450 let first = self.peek_char(0);
2451 // Skip leading minus
2452 if first == Some('-') {
2453 offset = 1;
2454 }
2455 // Scan digits and at most one dot
2456 let mut has_digit = false;
2457 let mut dot_count = 0;
2458 loop {
2459 match self.peek_char(offset) {
2460 Some(c) if c.is_ascii_digit() => {
2461 has_digit = true;
2462 offset += 1;
2463 }
2464 Some('.') => {
2465 dot_count += 1;
2466 if dot_count > 1 {
2467 // Multiple dots (e.g. 0.5.8) — not a number
2468 return false;
2469 }
2470 offset += 1;
2471 }
2472 Some(c) if c.is_ascii_alphabetic() || c == '_' => {
2473 // Letters follow the digits — not a pure number (e.g. 500m, 128Mi)
2474 return false;
2475 }
2476 Some(c) => {
2477 // For a token to be a pure number, what follows
2478 // the digits must be end-of-token. In flow
2479 // context that's a flow indicator. In block
2480 // context the rest of the line must be pure
2481 // whitespace (possibly trailing a comment) — if
2482 // there's more non-whitespace content on this
2483 // line, the digits are part of a larger plain
2484 // scalar like \`1 - 3\` (yaml-test-suite P76L)
2485 // or \`20:03:20\` (yaml-test-suite U9NS).
2486 if self.flow_level > 0 && ",[]{}".contains(c) {
2487 return has_digit;
2488 }
2489 if c == '\n' || c == '\r' {
2490 return has_digit;
2491 }
2492 if c == ' ' || c == '\t' {
2493 // Look ahead: rest of line must be whitespace
2494 // or a comment.
2495 let mut probe = offset + 1;
2496 loop {
2497 match self.peek_char(probe) {
2498 None => return has_digit,
2499 Some('\n' | '\r') => return has_digit,
2500 Some('#') => return has_digit,
2501 Some(' ' | '\t') => probe += 1,
2502 Some(_) => return false,
2503 }
2504 }
2505 }
2506 if c == ':' {
2507 let next = self.peek_char(offset + 1);
2508 return has_digit && next.map_or(true, |nc| nc.is_whitespace());
2509 }
2510 return false;
2511 }
2512 None => return has_digit,
2513 }
2514 }
2515 }
2516
2517 fn peek_char(&self, offset: isize) -> Option<char> {
2518 // `unsigned_abs()` yields the magnitude as `usize` and is total — it
2519 // is defined even for `isize::MIN`, where `-offset` overflows (panic
2520 // in debug, wrapping UB in release). `checked_add`/`checked_sub` then
2521 // make an out-of-range index a `None` rather than a panic (#20).
2522 let magnitude = offset.unsigned_abs();
2523 let target_index = if offset >= 0 {
2524 self.current_char_index.checked_add(magnitude)?
2525 } else {
2526 self.current_char_index.checked_sub(magnitude)?
2527 };
2528 self.char_cache.get(target_index).copied()
2529 }
2530
2531 /// Scan an anchor token (&name)
2532 fn scan_anchor(&mut self) -> Result<Token> {
2533 let start_pos = self.position;
2534 self.advance(); // Skip '&'
2535
2536 let name = self.scan_identifier()?;
2537 if name.is_empty() {
2538 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2539 "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
2540 );
2541 return Err(Error::scan_with_context(
2542 self.position,
2543 "Anchor name cannot be empty",
2544 context,
2545 ));
2546 }
2547
2548 // Track anchor for resource limits
2549 self.resource_tracker.add_anchor(&self.limits)?;
2550
2551 Ok(Token::new(
2552 TokenType::Anchor(name),
2553 start_pos,
2554 self.position,
2555 ))
2556 }
2557
2558 /// Scan an alias token (*name)
2559 fn scan_alias(&mut self) -> Result<Token> {
2560 let start_pos = self.position;
2561 self.advance(); // Skip '*'
2562
2563 let name = self.scan_identifier()?;
2564 if name.is_empty() {
2565 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
2566 "Provide a valid alias name after *, e.g., *alias_name".to_string(),
2567 );
2568 return Err(Error::scan_with_context(
2569 self.position,
2570 "Alias name cannot be empty",
2571 context,
2572 ));
2573 }
2574
2575 Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
2576 }
2577
2578 /// Scan an identifier (used for anchor and alias names)
2579 fn scan_identifier(&mut self) -> Result<String> {
2580 // Per YAML 1.2 §6.9.2 (ns-anchor-name = ns-anchor-char+), the only
2581 // exclusions are whitespace and the flow indicators `,[]{}`. This
2582 // accepts ASCII alphanumeric, underscore, hyphen, AND full unicode
2583 // codepoints (including emoji), matching the spec exactly.
2584 let mut identifier = String::new();
2585 while let Some(ch) = self.current_char {
2586 if ch.is_whitespace() || matches!(ch, ',' | '[' | ']' | '{' | '}') {
2587 break;
2588 }
2589 identifier.push(ch);
2590 // Cap heap growth before an attacker-controlled anchor/alias name
2591 // can exhaust memory: bail the moment it exceeds max_string_length,
2592 // rather than after the full String is materialized (#24).
2593 self.resource_tracker
2594 .check_string_length(&self.limits, identifier.len())?;
2595 self.advance();
2596 }
2597 Ok(identifier)
2598 }
2599
2600 /// Scan a tag token (`!tag`, `!!tag`, or `!<verbatim>`).
2601 fn scan_tag(&mut self) -> Result<Token> {
2602 let start_pos = self.position;
2603 self.advance(); // Skip first '!'
2604
2605 let mut tag = String::from("!");
2606
2607 // Check for verbatim tag format: !<tag>
2608 if self.current_char == Some('<') {
2609 tag.push('<');
2610 self.advance(); // Skip '<'
2611
2612 // Scan until closing '>'
2613 while let Some(ch) = self.current_char {
2614 if ch == '>' {
2615 tag.push(ch);
2616 self.advance();
2617 break;
2618 } else if ch.is_control() || ch.is_whitespace() {
2619 return Err(Error::scan(
2620 self.position,
2621 "Invalid character in verbatim tag".to_string(),
2622 ));
2623 }
2624 tag.push(ch);
2625 self.advance();
2626 }
2627 } else {
2628 // Check for secondary tag handle: !!
2629 if self.current_char == Some('!') {
2630 tag.push('!');
2631 self.advance(); // Skip second '!'
2632 }
2633
2634 // Scan tag name/suffix.
2635 //
2636 // Per YAML 1.2 §5.6, tag suffixes are URI references — they may
2637 // contain any URI character (RFC 3986 unreserved + sub-delims +
2638 // a few others) or `%XX` percent-encoded bytes. The handful of
2639 // characters listed below covers the alphanumeric + URI-safe
2640 // punctuation set used by yaml-test-suite. Percent decoding of
2641 // `%XX` happens later in `TagResolver::resolve`.
2642 //
2643 // §5.3: inside a flow collection, the flow indicators
2644 // `,`, `[`, `]`, `{`, `}` always terminate a node — so we
2645 // must NOT consume them into the tag suffix even though
2646 // RFC 3986 permits them in URIs (yaml-test-suite WZ62).
2647 // YAML 1.2 in practice treats `,` as a flow indicator that
2648 // must be percent-encoded (\`%2C\`) when it appears inside
2649 // a tag suffix — bare \`,\` is not allowed in EITHER block
2650 // or flow context (yaml-test-suite U99R).
2651 while let Some(ch) = self.current_char {
2652 if matches!(ch, ',') {
2653 break;
2654 }
2655 if self.flow_level > 0 && matches!(ch, '[' | ']' | '{' | '}') {
2656 break;
2657 }
2658 // §6.8 / §5.6: `:` IS a valid tag URI character — e.g.
2659 // `tag:yaml.org,2002:str` legitimately contains two
2660 // colons inside its URI. But a `:` followed by
2661 // whitespace, EOL or EOF is the YAML mapping-value
2662 // indicator and MUST terminate the tag, otherwise
2663 // `!handle!suffix: value` is mis-scanned as
2664 // `Tag("!handle!suffix:") Scalar("value")` and the
2665 // implicit-key mapping structure is lost. Mirrors the
2666 // `,` carve-out above (a valid URI char that's also a
2667 // YAML flow indicator in some contexts).
2668 if ch == ':' {
2669 match self.peek_char(1) {
2670 None => break,
2671 Some(c) if c.is_whitespace() => break,
2672 _ => {}
2673 }
2674 }
2675 if ch.is_alphanumeric() || "-._~:/?#[]@!$&'()*+;=%".contains(ch) {
2676 tag.push(ch);
2677 self.advance();
2678 } else {
2679 break;
2680 }
2681 }
2682 }
2683
2684 Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
2685 }
2686
2687 /// Scan a literal block scalar (|)
2688 fn scan_literal_block_scalar(&mut self) -> Result<Token> {
2689 let start_pos = self.position;
2690 self.advance(); // Skip '|'
2691
2692 // Parse block scalar header (indicators like +, -, explicit indent)
2693 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2694
2695 // Skip to next line
2696 self.skip_to_next_line()?;
2697
2698 // Determine indentation. `base_indent` is the surrounding
2699 // block's indent — i.e. the indent of the sequence or
2700 // mapping that contains this scalar. `self.current_indent`
2701 // is sometimes set to the inline indicator column (e.g. 2
2702 // for `- |`), which would make `base_indent + explicit`
2703 // wrong; use the top of `indent_stack` instead
2704 // (yaml-test-suite 4QFQ `|1`).
2705 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2706 let content_indent = if let Some(explicit) = explicit_indent {
2707 base_indent + explicit
2708 } else {
2709 // Find the first non-empty content line to determine indentation
2710 self.find_block_scalar_indent(base_indent)?
2711 };
2712
2713 // Collect the literal block content
2714 let content = self.collect_literal_block_content(content_indent, chomping)?;
2715
2716 Ok(Token::new(
2717 TokenType::BlockScalarLiteral(content),
2718 start_pos,
2719 self.position,
2720 ))
2721 }
2722
2723 /// Scan a folded block scalar (>)
2724 fn scan_folded_block_scalar(&mut self) -> Result<Token> {
2725 let start_pos = self.position;
2726 self.advance(); // Skip '>'
2727
2728 // Parse block scalar header (indicators like +, -, explicit indent)
2729 let (chomping, explicit_indent) = self.scan_block_scalar_header()?;
2730
2731 // Skip to next line
2732 self.skip_to_next_line()?;
2733
2734 // See scan_literal_block_scalar for why we read `indent_stack`
2735 // rather than `current_indent`.
2736 let base_indent = self.indent_stack.last().copied().unwrap_or(0);
2737 let content_indent = if let Some(explicit) = explicit_indent {
2738 base_indent + explicit
2739 } else {
2740 // Find the first non-empty content line to determine indentation
2741 self.find_block_scalar_indent(base_indent)?
2742 };
2743
2744 // Collect the folded block content
2745 let content = self.collect_folded_block_content(content_indent, chomping)?;
2746
2747 Ok(Token::new(
2748 TokenType::BlockScalarFolded(content),
2749 start_pos,
2750 self.position,
2751 ))
2752 }
2753
2754 /// Parse block scalar header indicators (+, -, and explicit indent)
2755 fn scan_block_scalar_header(&mut self) -> Result<(ChompingMode, Option<usize>)> {
2756 let mut chomping = ChompingMode::Clip;
2757 let mut explicit_indent: Option<usize> = None;
2758 // §6.6: a comment must be preceded by whitespace. \`|#x\` and
2759 // \`>#x\` are invalid (yaml-test-suite X4QW).
2760 let mut seen_separator_ws = false;
2761
2762 // Parse indicators in any order
2763 while let Some(ch) = self.current_char {
2764 match ch {
2765 '+' => {
2766 chomping = ChompingMode::Keep;
2767 self.advance();
2768 }
2769 '-' => {
2770 chomping = ChompingMode::Strip;
2771 self.advance();
2772 }
2773 '0'..='9' => {
2774 let digit = ch.to_digit(10).unwrap() as usize;
2775 if explicit_indent.is_some() {
2776 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2777 .with_suggestion(
2778 "Use only one indent indicator digit in block scalar".to_string(),
2779 );
2780 return Err(Error::scan_with_context(
2781 self.position,
2782 "Multiple indent indicators in block scalar",
2783 context,
2784 ));
2785 }
2786 // YAML 1.2 §8.1.1.1: explicit indent indicator is
2787 // 1..=9. `|0` and `>0` are invalid
2788 // (yaml-test-suite 2G84/00).
2789 if digit == 0 {
2790 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2791 .with_suggestion(
2792 "Block-scalar indent indicator must be 1-9".to_string(),
2793 );
2794 return Err(Error::scan_with_context(
2795 self.position,
2796 "Block-scalar indent indicator `0` is invalid",
2797 context,
2798 ));
2799 }
2800 explicit_indent = Some(digit);
2801 self.advance();
2802 }
2803 ' ' | '\t' => {
2804 seen_separator_ws = true;
2805 self.advance(); // Skip whitespace
2806 }
2807 '#' => {
2808 if !seen_separator_ws {
2809 return Err(Error::scan(
2810 self.position,
2811 "Comment in block-scalar header must be preceded by whitespace"
2812 .to_string(),
2813 ));
2814 }
2815 // Skip comment to end of line
2816 while let Some(ch) = self.current_char {
2817 self.advance();
2818 if ch == '\n' || ch == '\r' {
2819 break;
2820 }
2821 }
2822 break;
2823 }
2824 '\n' | '\r' => break,
2825 _ => {
2826 let context = ErrorContext::from_input(&self.input, &self.position, 2)
2827 .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
2828 return Err(Error::invalid_character_with_context(
2829 self.position,
2830 ch,
2831 "block scalar header",
2832 context,
2833 ));
2834 }
2835 }
2836 }
2837
2838 Ok((chomping, explicit_indent))
2839 }
2840
2841 /// Advance the cursor PAST the next line break, but do not consume
2842 /// any leading whitespace on the line that follows. The block-
2843 /// scalar header parser uses this to step from the indicator line
2844 /// to the start of the content line — the next line's leading
2845 /// spaces are part of its content_indent, not header whitespace.
2846 fn skip_to_next_line(&mut self) -> Result<()> {
2847 // If we're already at column 1 (the comment handler in
2848 // scan_block_scalar_header may have already advanced past a
2849 // newline), do nothing — the next line's leading whitespace
2850 // belongs to its content_indent.
2851 if self.position.column == 1 {
2852 return Ok(());
2853 }
2854 while let Some(ch) = self.current_char {
2855 match ch {
2856 '\n' | '\r' => {
2857 self.advance();
2858 return Ok(());
2859 }
2860 ' ' | '\t' => {
2861 self.advance();
2862 }
2863 _ => return Ok(()),
2864 }
2865 }
2866 Ok(())
2867 }
2868
2869 /// Find the content indentation for a block scalar.
2870 ///
2871 /// Per spec §8.1.1.1, indent is the leading-space count of the first
2872 /// non-empty content line (or the longest blank-line indent if no
2873 /// non-empty line exists). A non-empty line whose indent is not
2874 /// strictly deeper than `base_indent` is outside the scalar's
2875 /// scope — that line is a sibling structure, not content
2876 /// (yaml-test-suite K858).
2877 fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
2878 let saved_position = self.position;
2879 let saved_char = self.current_char;
2880 let saved_char_index = self.current_char_index;
2881
2882 let mut max_blank_indent: usize = 0;
2883 let mut found = false;
2884 let mut content_indent: usize = 1;
2885
2886 loop {
2887 let mut line_indent = 0;
2888 while self.current_char == Some(' ') {
2889 line_indent += 1;
2890 self.advance();
2891 }
2892 // §6.1 + §8.1: tabs cannot serve as block-scalar
2893 // indentation. A line that BEGINS with a tab (no leading
2894 // spaces) inside the block scalar's indent search is
2895 // invalid (yaml-test-suite Y79Y/000 \`foo: |\\n\\tbar\`).
2896 // Tabs that appear AFTER one or more spaces are content,
2897 // not indentation, and remain valid (yaml-test-suite
2898 // 96NN/00 \`foo: |-\\n \\tbar\`).
2899 if line_indent == 0 && self.current_char == Some('\t') {
2900 return Err(Error::scan(
2901 self.position,
2902 "Tab cannot serve as block-scalar indentation".to_string(),
2903 ));
2904 }
2905
2906 match self.current_char {
2907 None => {
2908 if line_indent > max_blank_indent {
2909 max_blank_indent = line_indent;
2910 }
2911 break;
2912 }
2913 Some('\n' | '\r') => {
2914 if line_indent > max_blank_indent {
2915 max_blank_indent = line_indent;
2916 }
2917 self.advance();
2918 // fall through to next iteration
2919 }
2920 Some(_) => {
2921 // If we're nested inside another block — either
2922 // via the `indent_stack` (normal mapping/sequence
2923 // open) or `compact_sequence_indents` (a
2924 // compact block sequence at the same indent as
2925 // its parent) — and this candidate line is not
2926 // strictly deeper than base_indent, it's a
2927 // sibling outside the scalar's scope (yaml-test-
2928 // suite K858, P2AD).
2929 let inside_block =
2930 self.indent_stack.len() > 1 || !self.compact_sequence_indents.is_empty();
2931 if inside_block && line_indent <= base_indent {
2932 content_indent = max_blank_indent.max(base_indent + 1);
2933 } else {
2934 content_indent = line_indent;
2935 }
2936 // §8.1.2.1: leading blank lines may not exceed the
2937 // detected content indent — that ambiguity is
2938 // invalid (yaml-test-suite W9L4, S98Z).
2939 if max_blank_indent > content_indent {
2940 self.position = saved_position;
2941 self.current_char = saved_char;
2942 self.current_char_index = saved_char_index;
2943 return Err(Error::scan(
2944 self.position,
2945 "Block scalar leading blank-line indent exceeds content indent"
2946 .to_string(),
2947 ));
2948 }
2949 found = true;
2950 break;
2951 }
2952 }
2953 }
2954
2955 if !found {
2956 content_indent = max_blank_indent;
2957 }
2958
2959 self.position = saved_position;
2960 self.current_char = saved_char;
2961 self.current_char_index = saved_char_index;
2962
2963 Ok(content_indent)
2964 }
2965
2966 /// Count indentation at start of current line
2967 fn count_line_indent(&mut self) -> usize {
2968 let mut indent = 0;
2969 let saved_position = self.position;
2970 let saved_char = self.current_char;
2971 let saved_char_index = self.current_char_index;
2972
2973 while let Some(ch) = self.current_char {
2974 if ch == ' ' {
2975 indent += 1;
2976 self.advance();
2977 } else if ch == '\t' {
2978 indent += 8; // Tab counts as 8 spaces
2979 self.advance();
2980 } else {
2981 break;
2982 }
2983 }
2984
2985 // Restore position
2986 self.position = saved_position;
2987 self.current_char = saved_char;
2988 self.current_char_index = saved_char_index;
2989
2990 indent
2991 }
2992
2993 /// Collect content for a literal block scalar.
2994 ///
2995 /// Each line is preserved with its terminating newline. After collection
2996 /// we apply the chomping mode per spec §8.1.1.2.
2997 fn collect_literal_block_content(
2998 &mut self,
2999 content_indent: usize,
3000 chomping: ChompingMode,
3001 ) -> Result<String> {
3002 let mut content = String::new();
3003
3004 loop {
3005 // Count current line's leading-space indent.
3006 let mut line_indent = 0;
3007 let save_pos = self.position;
3008 let save_ch = self.current_char;
3009 let save_idx = self.current_char_index;
3010 while self.current_char == Some(' ') {
3011 line_indent += 1;
3012 self.advance();
3013 }
3014
3015 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3016
3017 if !line_is_blank && line_indent < content_indent {
3018 // Non-empty line with less indent ends the scalar; rewind.
3019 self.position = save_pos;
3020 self.current_char = save_ch;
3021 self.current_char_index = save_idx;
3022 break;
3023 }
3024
3025 // Document marker at line start always ends the scalar,
3026 // regardless of content_indent (allows zero-indented
3027 // block scalars per yaml-test-suite FP8R).
3028 if line_indent == 0 && self.is_doc_marker_here() {
3029 self.position = save_pos;
3030 self.current_char = save_ch;
3031 self.current_char_index = save_idx;
3032 break;
3033 }
3034
3035 if line_is_blank {
3036 // A blank line counts when there's an actual line break
3037 // to consume. EOF after we've consumed some whitespace
3038 // on the trailing line ALSO counts as one final blank
3039 // line (yaml-test-suite JEF9/02: `- |+\n `).
3040 if matches!(self.current_char, Some('\n' | '\r')) {
3041 // Whitespace beyond content_indent is literal content
3042 // even on blank lines (yaml-test-suite 6FWR).
3043 for _ in content_indent..line_indent {
3044 content.push(' ');
3045 }
3046 content.push('\n');
3047 self.advance();
3048 continue;
3049 }
3050 if line_indent > 0 {
3051 for _ in content_indent..line_indent {
3052 content.push(' ');
3053 }
3054 content.push('\n');
3055 }
3056 break;
3057 }
3058
3059 // Content line: we already consumed `line_indent` spaces, but
3060 // only `content_indent` of them belong to indentation. Any
3061 // extra leading spaces are literal content.
3062 let mut line = String::new();
3063 for _ in content_indent..line_indent {
3064 line.push(' ');
3065 }
3066 while let Some(ch) = self.current_char {
3067 if ch == '\n' || ch == '\r' {
3068 self.advance();
3069 break;
3070 }
3071 line.push(ch);
3072 self.advance();
3073 }
3074 content.push_str(&line);
3075 content.push('\n');
3076
3077 if self.current_char.is_none() {
3078 break;
3079 }
3080 }
3081
3082 Ok(apply_chomping(content, chomping))
3083 }
3084
3085 /// Check if cursor is at `---` or `...` followed by whitespace/EOL.
3086 fn is_doc_marker_here(&self) -> bool {
3087 let c0 = self.current_char;
3088 let c1 = self.peek_char(1);
3089 let c2 = self.peek_char(2);
3090 let c3 = self.peek_char(3);
3091 let trailing_ok = c3.map_or(true, |c| c.is_whitespace());
3092 (c0 == Some('-') && c1 == Some('-') && c2 == Some('-') && trailing_ok)
3093 || (c0 == Some('.') && c1 == Some('.') && c2 == Some('.') && trailing_ok)
3094 }
3095
3096 /// Collect content for a folded block scalar.
3097 ///
3098 /// Folding rules (§8.1.3): a sequence of single blank lines between
3099 /// equally-indented non-empty content lines collapses into a single
3100 /// space; runs of blank lines emit `n-1` newlines; more-indented
3101 /// lines preserve their newline boundaries. After collection, apply
3102 /// chomping (§8.1.1.2).
3103 fn collect_folded_block_content(
3104 &mut self,
3105 content_indent: usize,
3106 chomping: ChompingMode,
3107 ) -> Result<String> {
3108 #[derive(Clone, Copy, PartialEq, Eq)]
3109 enum LineKind {
3110 Normal,
3111 MoreIndented,
3112 Empty,
3113 }
3114 struct Line {
3115 text: String,
3116 kind: LineKind,
3117 }
3118
3119 let mut lines: Vec<Line> = Vec::new();
3120
3121 loop {
3122 let mut line_indent = 0;
3123 let save_pos = self.position;
3124 let save_ch = self.current_char;
3125 let save_idx = self.current_char_index;
3126 while self.current_char == Some(' ') {
3127 line_indent += 1;
3128 self.advance();
3129 }
3130
3131 let line_is_blank = matches!(self.current_char, Some('\n' | '\r') | None);
3132
3133 if !line_is_blank && line_indent < content_indent {
3134 self.position = save_pos;
3135 self.current_char = save_ch;
3136 self.current_char_index = save_idx;
3137 break;
3138 }
3139
3140 if line_indent == 0 && self.is_doc_marker_here() {
3141 self.position = save_pos;
3142 self.current_char = save_ch;
3143 self.current_char_index = save_idx;
3144 break;
3145 }
3146
3147 if line_is_blank {
3148 if matches!(self.current_char, Some('\n' | '\r')) {
3149 lines.push(Line {
3150 text: String::new(),
3151 kind: LineKind::Empty,
3152 });
3153 self.advance();
3154 continue;
3155 }
3156 break;
3157 }
3158
3159 // Capture extra-indent leading spaces as part of content.
3160 let mut text = String::new();
3161 for _ in content_indent..line_indent {
3162 text.push(' ');
3163 }
3164 while let Some(ch) = self.current_char {
3165 if ch == '\n' || ch == '\r' {
3166 self.advance();
3167 break;
3168 }
3169 text.push(ch);
3170 self.advance();
3171 }
3172 // §8.1.3.2: "more indented" means the content (after the
3173 // common indent strip) begins with extra whitespace —
3174 // either spaces or tabs (yaml-test-suite MJS9).
3175 let kind = if text.starts_with(' ') || text.starts_with('\t') {
3176 LineKind::MoreIndented
3177 } else {
3178 LineKind::Normal
3179 };
3180 lines.push(Line { text, kind });
3181
3182 if self.current_char.is_none() {
3183 break;
3184 }
3185 }
3186
3187 // Build the folded output.
3188 let mut content = String::new();
3189 let mut idx = 0;
3190 while idx < lines.len() {
3191 let line = &lines[idx];
3192 match line.kind {
3193 LineKind::Normal | LineKind::MoreIndented => {
3194 content.push_str(&line.text);
3195 // Lookahead: count immediately-following empty lines.
3196 let mut j = idx + 1;
3197 let mut empties = 0;
3198 while j < lines.len() && lines[j].kind == LineKind::Empty {
3199 empties += 1;
3200 j += 1;
3201 }
3202 if j < lines.len() {
3203 // Spec §8.1.3.2: folding behaviour depends on
3204 // whether either surrounding content line is
3205 // "more indented" than the content indent.
3206 // - both Normal, 0 empties → fold to space.
3207 // - both Normal, k empties → k newlines (one
3208 // break folded out).
3209 // - any MoreIndented, 0 empties → 1 newline.
3210 // - any MoreIndented, k empties → k+1 newlines
3211 // (every break preserved).
3212 let mi_adjacent = line.kind == LineKind::MoreIndented
3213 || lines[j].kind == LineKind::MoreIndented;
3214 if empties == 0 {
3215 if mi_adjacent {
3216 content.push('\n');
3217 } else {
3218 content.push(' ');
3219 }
3220 } else {
3221 let breaks = if mi_adjacent { empties + 1 } else { empties };
3222 for _ in 0..breaks {
3223 content.push('\n');
3224 }
3225 }
3226 idx = j;
3227 } else {
3228 // End of stream after content (possibly trailing empties).
3229 // Always emit final `\n` for the last content line; extra
3230 // trailing empties contribute additional `\n`s, and chomping
3231 // will trim them later if needed.
3232 content.push('\n');
3233 for _ in 0..empties {
3234 content.push('\n');
3235 }
3236 break;
3237 }
3238 }
3239 LineKind::Empty => {
3240 // Leading empty lines (no preceding content): emit as `\n`s.
3241 content.push('\n');
3242 idx += 1;
3243 }
3244 }
3245 }
3246
3247 Ok(apply_chomping(content, chomping))
3248 }
3249
3250 /// Emit a `BlockMappingStart` token if the current position is the
3251 /// start of an implicit key and no mapping is yet active at this
3252 /// indent level. Shared by plain and quoted scalar dispatch.
3253 fn maybe_open_block_mapping_for_key(&mut self) -> Result<()> {
3254 // Use `unwrap_or(0)` for parity with the indentation module's
3255 // helpers — defends against error-recovery pop paths that could
3256 // leave the stack momentarily empty (#18).
3257 let last_indent = self.indent_stack.last().copied().unwrap_or(0);
3258 let should_start_new_mapping = if self.current_indent > last_indent {
3259 true
3260 } else if self.current_indent == last_indent {
3261 !self.check_active_mapping_at_level(self.current_indent)
3262 } else {
3263 false
3264 };
3265 if should_start_new_mapping {
3266 // §6.1 + §8.22: opening a NEW block mapping at deeper
3267 // indent than the parent only makes sense if the parent
3268 // has a key WITHOUT a value (the new mapping IS that
3269 // value). If the parent's last content is a complete
3270 // (key, value) pair — i.e. the most recent meaningful
3271 // token is a value-position scalar/alias/close — then
3272 // there's no node to host the deeper mapping (yaml-test-
3273 // suite U44R: \`map:\\n key1: q\\n key2: bad\` — key2
3274 // is deeper than key1 but key1's value is already \`q\`).
3275 if self.current_indent > last_indent && last_indent > 0 {
3276 let mut depth = 0i32;
3277 let mut last_meaningful = None;
3278 for t in self.tokens.iter().rev() {
3279 match &t.token_type {
3280 TokenType::BlockEnd => depth += 1,
3281 TokenType::BlockMappingStart | TokenType::BlockSequenceStart => {
3282 if depth == 0 {
3283 break;
3284 }
3285 depth -= 1;
3286 }
3287 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3288 other => {
3289 if depth == 0 {
3290 last_meaningful = Some(other.clone());
3291 break;
3292 }
3293 }
3294 }
3295 }
3296 if matches!(
3297 last_meaningful,
3298 Some(
3299 TokenType::Scalar(..)
3300 | TokenType::Alias(_)
3301 | TokenType::FlowSequenceEnd
3302 | TokenType::FlowMappingEnd
3303 | TokenType::BlockScalarLiteral(..)
3304 | TokenType::BlockScalarFolded(..)
3305 )
3306 ) {
3307 return Err(Error::scan(
3308 self.position,
3309 "Indentation increase has no parent in current mapping/sequence"
3310 .to_string(),
3311 ));
3312 }
3313 }
3314 self.indent_stack.push(self.current_indent);
3315 self.indent_is_sequence.push(false);
3316 self.resource_tracker
3317 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
3318 self.tokens
3319 .push(Token::simple(TokenType::BlockMappingStart, self.position));
3320 }
3321 Ok(())
3322 }
3323
3324 /// Look ahead on the current line for a `:` that marks a mapping key.
3325 ///
3326 /// Per YAML 1.2 §7.3.3, a plain scalar may contain a `:` that is not
3327 /// followed by whitespace. Only `: ` terminates the scalar. If the
3328 /// line begins with `"` or `'`, the leading quoted scalar's contents
3329 /// are scanned past (including `''` and `\"` escapes) before looking
3330 /// for the `: ` that would make this scalar a key. This handles
3331 /// yaml-test-suite 6H3V (`'foo: bar\': baz'`) and 6SLA.
3332 /// For an alias/anchor at the current position, scan past
3333 /// the `&`/`*` and the name characters; if the FIRST char that
3334 /// would terminate the name is `:`, the colon is PART of the
3335 /// alias/anchor name (yaml-test-suite 2SXE). Returns true in
3336 /// that case so the caller can skip the implicit-key fast-path.
3337 fn colon_belongs_to_alias_anchor_name(&self) -> bool {
3338 // Start after the `&` / `*` introducer.
3339 let mut i = self.current_char_index + 1;
3340 let n = self.char_cache.len();
3341 // Per scan_identifier rules: stop at whitespace or flow indicator.
3342 while i < n {
3343 let c = self.char_cache[i];
3344 if c.is_whitespace() || matches!(c, ',' | '[' | ']' | '{' | '}') {
3345 break;
3346 }
3347 i += 1;
3348 }
3349 // If the next char (or last consumed?) at termination is `:`,
3350 // then the name ended with `:`. Look at the LAST consumed
3351 // char. Actually our scan_identifier accepts `:` as part of
3352 // name — so the colon is already in the name. There's no
3353 // separate "value indicator" colon after.
3354 //
3355 // For the implicit-key fast path to be wrong, we need the
3356 // name to END with `:` (last char of name is `:`).
3357 if i > self.current_char_index + 1 {
3358 let last_name_char = self.char_cache[i - 1];
3359 if last_name_char == ':' {
3360 return true;
3361 }
3362 }
3363 false
3364 }
3365
3366 /// Scan ahead on the current line (the rest of the post-indent
3367 /// content) to determine whether it looks like an implicit
3368 /// mapping key — i.e. has a `: ` separator (or `:` at line end)
3369 /// before any newline.
3370 fn line_after_indent_is_implicit_key(&self) -> bool {
3371 let mut i = self.current_char_index;
3372 let n = self.char_cache.len();
3373 while i < n {
3374 let ch = self.char_cache[i];
3375 if ch == '\n' || ch == '\r' {
3376 return false;
3377 }
3378 if ch == ':' {
3379 let next = self.char_cache.get(i + 1).copied();
3380 if next.is_none() || next.map_or(false, |c| c.is_whitespace()) {
3381 return true;
3382 }
3383 }
3384 i += 1;
3385 }
3386 false
3387 }
3388
3389 /// Walk back through recent tokens; if the last non-property
3390 /// token was `Value` (`:`), the parser is in value-expectation
3391 /// mode (key not yet matched with a value).
3392 fn most_recent_token_is_value_separator(&self) -> bool {
3393 for t in self.tokens.iter().rev() {
3394 match t.token_type {
3395 TokenType::Anchor(_) | TokenType::Tag(_) => {}
3396 TokenType::Value => return true,
3397 _ => return false,
3398 }
3399 }
3400 false
3401 }
3402
3403 fn check_for_mapping_ahead(&self) -> bool {
3404 let mut i = self.current_char_index;
3405 let n = self.char_cache.len();
3406 if i < n {
3407 let first = self.char_cache[i];
3408 if first == '\'' || first == '"' {
3409 let quote = first;
3410 i += 1;
3411 while i < n {
3412 let c = self.char_cache[i];
3413 if c == '\n' || c == '\r' {
3414 return false; // unterminated quote on line
3415 }
3416 if quote == '\'' && c == '\'' && self.char_cache.get(i + 1) == Some(&'\'') {
3417 // `''` is the in-string single-quote escape.
3418 i += 2;
3419 continue;
3420 }
3421 if quote == '"' && c == '\\' {
3422 // Skip the escaped char.
3423 i += 2;
3424 continue;
3425 }
3426 if c == quote {
3427 i += 1;
3428 break;
3429 }
3430 i += 1;
3431 }
3432 }
3433 }
3434 // Skip balanced flow collections — a `:` *inside* `[...]` or
3435 // `{...}` does NOT make the line a block-mapping key (the flow
3436 // collection itself can BE the key, but its inner colons are
3437 // part of its own structure). yaml-test-suite: `{key: v}` is
3438 // a standalone flow mapping; `[a]: outer` is a block-map key.
3439 let mut flow_depth: i32 = 0;
3440 while i < n {
3441 let ch = self.char_cache[i];
3442 match ch {
3443 '\n' | '\r' => return false,
3444 '[' | '{' => flow_depth += 1,
3445 ']' | '}' => flow_depth -= 1,
3446 ':' if flow_depth <= 0 => {
3447 let next = self.char_cache.get(i + 1).copied();
3448 match next {
3449 None => return true,
3450 Some(c) if c.is_whitespace() => return true,
3451 _ => {}
3452 }
3453 }
3454 _ => {}
3455 }
3456 i += 1;
3457 }
3458 false
3459 }
3460
3461 /// Check if there's an active mapping at the specified indentation level
3462 /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
3463 fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
3464 let mut depth = 0;
3465
3466 // Walk backwards through tokens to find the innermost unmatched block start.
3467 // Every BlockEnd increments depth; BlockMappingStart and BlockSequenceStart
3468 // decrement it (both open blocks that need a matching BlockEnd).
3469 // When depth == 0 we have found the block start that is still "open".
3470 for token in self.tokens.iter().rev() {
3471 match &token.token_type {
3472 TokenType::BlockMappingStart => {
3473 if depth == 0 {
3474 // The innermost open block is a mapping — active at this level.
3475 return true;
3476 }
3477 depth -= 1;
3478 }
3479 TokenType::BlockSequenceStart => {
3480 if depth == 0 {
3481 // The innermost open block is a sequence, not a mapping.
3482 return false;
3483 }
3484 depth -= 1;
3485 }
3486 TokenType::BlockEnd => {
3487 depth += 1;
3488 }
3489 TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
3490 // Stop at document boundaries
3491 break;
3492 }
3493 _ => {}
3494 }
3495 }
3496
3497 false
3498 }
3499}
3500
3501impl Scanner for BasicScanner {
3502 fn check_token(&self) -> bool {
3503 // For lazy scanning: check if we have cached tokens or can generate more
3504 self.token_index < self.tokens.len() || !self.done
3505 }
3506
3507 fn peek_token(&self) -> Result<Option<&Token>> {
3508 // This is a bit tricky with lazy scanning since peek shouldn't mutate
3509 // For now, return cached token if available
3510 Ok(self.tokens.get(self.token_index))
3511 }
3512
3513 fn get_token(&mut self) -> Result<Option<Token>> {
3514 // If we need more tokens and haven't finished, scan next token
3515 if self.token_index >= self.tokens.len() && !self.done {
3516 self.scan_next_token()?;
3517 }
3518
3519 if self.token_index < self.tokens.len() {
3520 let token = self.tokens[self.token_index].clone();
3521 self.token_index += 1;
3522 Ok(Some(token))
3523 } else {
3524 Ok(None)
3525 }
3526 }
3527
3528 fn reset(&mut self) {
3529 self.token_index = 0;
3530 self.position = Position::start();
3531 self.tokens.clear();
3532 self.done = false;
3533 self.current_char = self.input.chars().next();
3534 self.indent_stack = vec![0];
3535 self.current_indent = 0;
3536 self.flow_level = 0;
3537 self.detected_indent_style = None;
3538 self.indent_samples.clear();
3539 self.previous_indent_level = 0;
3540 self.current_char_index = 0;
3541 self.current_char = self.char_cache.first().copied();
3542 }
3543
3544 fn position(&self) -> Position {
3545 self.position
3546 }
3547
3548 fn input(&self) -> &str {
3549 &self.input
3550 }
3551}
3552
3553#[cfg(test)]
3554mod tests {
3555 use super::*;
3556
/// Regression for #20. The negative branch of `peek_char` must not
/// compute `-offset` for `isize::MIN`: that negation overflows (panic
/// in debug builds, silent wrap-around in release builds). An
/// out-of-range backward offset yields `None`.
#[test]
fn peek_char_handles_isize_min_without_overflow() {
    let scanner = BasicScanner::new(String::from("abc"));
    let peeked = scanner.peek_char(isize::MIN);
    assert_eq!(peeked, None);
}
3565
/// Regression for #20. `ScalarScanner::peek_char` takes a `usize`,
/// which the `BasicScanner` bridge casts to `isize`. An offset just
/// above `isize::MAX` wraps to `isize::MIN` in that cast — the call
/// must still return `None` instead of panicking.
#[test]
fn scalar_scanner_peek_char_survives_huge_usize_offset() {
    let scanner = BasicScanner::new(String::from("abc"));
    // Smallest usize that no longer fits in isize (== isize::MAX + 1).
    let beyond_isize = usize::MAX / 2 + 1;
    assert_eq!(ScalarScanner::peek_char(&scanner, beyond_isize), None);
}
3576
/// Regression for #19. `new_eager_with_comments` must record a scanning
/// error on malformed input so callers can detect the failure through
/// `has_scanning_error()`. Previously the result of `scan_all_tokens`
/// was dropped, silently truncating the token stream.
#[test]
fn new_eager_with_comments_propagates_scanning_errors() {
    // A doc-start marker inside an unterminated quoted scalar is a
    // scanning error (see `Error::scan(... "inside quoted scalar")`).
    let input = "\"abc\n---\n";

    // Anchor the parity check: the plain constructor must report it.
    let plain_failed = BasicScanner::new_eager(input.to_string()).has_scanning_error();
    assert!(
        plain_failed,
        "precondition: malformed input must produce a scanning error via new_eager"
    );

    // The comment-preserving constructor must behave the same way.
    let with_comments_failed =
        BasicScanner::new_eager_with_comments(input.to_string()).has_scanning_error();
    assert!(
        with_comments_failed,
        "new_eager_with_comments must NOT silently swallow scanner errors"
    );
}
3600
3601 /// Drive the parser pipeline on `input` in a dedicated thread, returning
3602 /// `None` if it doesn't finish within `Duration::from_secs(2)`. Used by
3603 /// regression tests for parser hangs so a still-broken parser doesn't
3604 /// block the whole `cargo test` run.
3605 fn parse_with_timeout(input: &str) -> Option<Vec<crate::parser::Event>> {
3606 use crate::parser::{BasicParser, Parser as ParserTrait};
3607 use std::sync::mpsc;
3608 use std::thread;
3609 use std::time::Duration;
3610
3611 let owned = input.to_string();
3612 let (tx, rx) = mpsc::channel();
3613 thread::spawn(move || {
3614 let mut p = BasicParser::new_eager(owned);
3615 let _ = p.take_scanning_error();
3616 let mut events = Vec::new();
3617 loop {
3618 match p.get_event() {
3619 Ok(Some(ev)) => events.push(ev),
3620 Ok(None) => break,
3621 Err(_) => break,
3622 }
3623 }
3624 let _ = tx.send(events);
3625 });
3626 rx.recv_timeout(Duration::from_secs(2)).ok()
3627 }
3628
/// Regression: `---` immediately followed by non-space text used to
/// spin the scanner forever. The `-` arm at line start tried
/// `scan_document_start` (correctly `None`) and then
/// `is_plain_scalar_start` (false for `-`), so nothing was consumed and
/// the outer loop re-entered on the same character. The fix falls
/// through to `scan_plain_scalar` whenever the text is not a document
/// marker. See yaml-test-suite 82AN / EXG3.
#[test]
fn three_dashes_directly_followed_by_text_does_not_hang() {
    let events = parse_with_timeout("---word1\nword2\n")
        .expect("parser hung — `---word1` should not produce an infinite loop");

    // A scalar whose value starts with `---` proves the dashes were
    // consumed as part of a plain scalar rather than as a document
    // marker.
    let dashed_scalars = events
        .iter()
        .filter(|event| match &event.event_type {
            crate::parser::EventType::Scalar { value, .. } => value.starts_with("---"),
            _ => false,
        })
        .count();
    assert!(
        dashed_scalars > 0,
        "expected a plain scalar starting with `---`, got events: {events:?}"
    );
}
3655
/// YAML 1.2 §7.3.3: `?`, `:` and `-` may begin a plain scalar when the
/// next character is non-space (and, in flow context, not a flow
/// indicator). `is_plain_scalar_start` used to reject all three
/// outright, so `?foo`, `:foo` and `-foo` were reported as
/// `Invalid character`. Tracked by yaml-test-suite 2EBW.
#[test]
fn question_mark_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("?foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Collect every scalar value until the stream ends or errors.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["?foo", "bar"]);
}
3675
/// A `:` directly followed by text starts a plain scalar, so `:foo` is
/// scanned as a key here.
#[test]
fn colon_followed_by_text_starts_plain_scalar() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager(":foo: bar\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Collect every scalar value until the stream ends or errors.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec![":foo", "bar"]);
}
3689
/// YAML 1.2: every started document must be closed by a DocumentEnd
/// event before StreamEnd. The `TokenType::StreamEnd` handler used to
/// emit `-DOC` only for the `DocumentContent` / `BlockNode` states and
/// skipped the `DocumentStart` state (entered after `---` and a single
/// scalar such as `"foo"`), losing the `-DOC` event. Affects
/// yaml-test-suite 27NA, 2G84/*, 2LFX and several others.
#[test]
fn explicit_doc_with_only_a_scalar_emits_doc_end_before_stream_end() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"foo\"\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    // Reduce each event to its yaml-test-suite style tag.
    let kinds: Vec<&str> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .map(|event| match event.event_type {
            EventType::StreamStart => "+STR",
            EventType::StreamEnd => "-STR",
            EventType::DocumentStart { .. } => "+DOC",
            EventType::DocumentEnd { .. } => "-DOC",
            EventType::Scalar { .. } => "=VAL",
            _ => "?",
        })
        .collect();

    // Critical: -DOC must come before -STR.
    let index_of = |tag: &str| kinds.iter().position(|kind| *kind == tag);
    let doc_end_idx = index_of("-DOC");
    let str_end_idx = index_of("-STR");
    assert!(
        doc_end_idx.is_some(),
        "missing -DOC in event stream: {kinds:?}"
    );
    assert!(
        doc_end_idx < str_end_idx,
        "expected -DOC before -STR, got {kinds:?}"
    );
}
3724
/// YAML 1.2 §5.7 hex / Unicode escapes in double-quoted strings: the
/// 2-digit `\x`, 4-digit `\u` and 8-digit `\U` forms must each decode
/// to the code point they name.
#[test]
fn double_quoted_hex_escapes_decode_to_codepoint() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};
    for (input, expected) in [
        (r#""\x41""#, "A"),
        // Written as an escape so the 4-digit `\u` form is exercised;
        // a literal `é` in the YAML text would not test decoding.
        (r#""\u00e9""#, "é"),
        (r#""\U0001F600""#, "\u{1f600}"),
    ] {
        let mut p = BasicParser::new_eager(input.to_string());
        assert!(
            p.take_scanning_error().is_none(),
            "no scan error for {input}"
        );
        // Only the first scalar event matters.
        let found = std::iter::from_fn(|| p.get_event().ok().flatten()).find_map(|ev| {
            match ev.event_type {
                EventType::Scalar { value, .. } => Some(value),
                _ => None,
            }
        });
        assert_eq!(found.as_deref(), Some(expected), "input {input}");
    }
}
3749
/// A `\x` escape with fewer than two hex digits must be reported as a
/// scanning error.
#[test]
fn truncated_hex_escape_is_a_scan_error() {
    use crate::parser::BasicParser;

    let mut parser = BasicParser::new_eager(r#""\x4""#.to_string());
    let scan_error = parser.take_scanning_error();
    assert!(scan_error.is_some(), "truncated \\x escape must error");
}
3759
/// YAML 1.2 §5.7: double-quoted strings accept only a fixed set of
/// escape sequences. `\.` (like any other unknown escape) must be
/// rejected, either while scanning or while parsing. Tracked by
/// yaml-test-suite 55WF.
#[test]
fn invalid_double_quoted_escape_is_a_scan_error() {
    use crate::parser::{BasicParser, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n\"\\.\"\n".to_string());
    let scan_failed = parser.take_scanning_error().is_some();

    // Only drive the parser when scanning itself did not fail.
    let parse_failed = !scan_failed
        && loop {
            match parser.get_event() {
                Ok(Some(_)) => continue,
                Ok(None) => break false,
                Err(_) => break true,
            }
        };

    assert!(
        scan_failed || parse_failed,
        "`\\.` is not a valid double-quoted escape and must error"
    );
}
3786
/// YAML 1.2: when a complex-key marker (`?`) is the first content after
/// an explicit document start (`---`), it opens an implicit block
/// mapping. The parser used to handle `?` only in the
/// `ImplicitDocumentStart` / `DocumentContent` / already-in-mapping
/// states and failed in `DocumentStart`, breaking inputs such as
/// `--- !!set\n? Mark McGwire\n...`. Tracked by yaml-test-suite 2XXW.
#[test]
fn complex_key_directly_after_explicit_doc_start_opens_mapping() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser =
        BasicParser::new_eager("--- !!set\n? Mark McGwire\n? Sammy Sosa\n".to_string());
    assert!(parser.take_scanning_error().is_none());

    let mut saw_map_start = false;
    // Drain the event stream; the loop yields whether it ended in error.
    let saw_error = loop {
        match parser.get_event() {
            Ok(Some(event)) => {
                saw_map_start |= matches!(event.event_type, EventType::MappingStart { .. });
            }
            Ok(None) => break false,
            Err(_) => break true,
        }
    };

    assert!(!saw_error, "complex key after `--- !!set` must not error");
    assert!(saw_map_start, "expected a MappingStart event");
}
3817
/// YAML 1.2 §6.9.2: anchor / alias names exclude only whitespace and
/// the flow indicators `,[]{}`. `scan_identifier` used to accept just
/// ASCII alphanumerics, `_` and `-`, and so rejected valid Unicode
/// anchors such as `&😁`. Tracked by yaml-test-suite 8XYN.
#[test]
fn anchor_name_may_contain_unicode_symbols() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager("---\n- &😁 unicode anchor\n".to_string());
    assert!(
        parser.take_scanning_error().is_none(),
        "unicode anchor must not error"
    );

    // Gather the anchor of every anchored scalar.
    let anchors: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar {
                anchor: Some(name), ..
            } => Some(name),
            _ => None,
        })
        .collect();
    assert_eq!(anchors, vec!["😁"]);
}
3841
/// YAML 1.2 §5.6 / RFC 3986 percent-encoding: a tag suffix may contain
/// `%XX` escapes, which are URI-decoded when the tag is resolved. The
/// scanner used to reject `%` in tag suffixes as "Invalid character",
/// so `!e!tag%21 baz` failed before the resolver could decode it.
/// Tracked by yaml-test-suite 6CK3.
#[test]
fn tag_suffix_with_percent_escape_resolves_to_decoded_uri() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager(
        "%TAG !e! tag:example.com,2000:app/\n---\n- !e!tag%21 baz\n".to_string(),
    );
    assert!(
        parser.take_scanning_error().is_none(),
        "tag percent-escapes must not error"
    );

    // Gather the resolved tag of every tagged scalar.
    let tags: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { tag: Some(tag), .. } => Some(tag),
            _ => None,
        })
        .collect();
    assert_eq!(tags, vec!["tag:example.com,2000:app/tag!"]);
}
3865
/// YAML 1.2 §6.8.4: "A YAML processor should ignore any directive it
/// does not recognize." A reserved `%FOO` directive therefore must NOT
/// be a scan error — its line is skipped and parsing carries on.
/// Tracked by yaml-test-suite test 2LFX.
#[test]
fn reserved_directive_is_ignored_not_an_error() {
    use crate::parser::{BasicParser, EventType, Parser as ParserTrait};

    let mut parser = BasicParser::new_eager(
        "%FOO bar baz # Should be ignored\n # with a warning.\n---\n\"foo\"\n"
            .to_string(),
    );
    assert!(
        parser.take_scanning_error().is_none(),
        "unknown directives must NOT produce a scan error"
    );

    // The document after the directive must still yield its scalar.
    let scalars: Vec<_> = std::iter::from_fn(|| parser.get_event().ok().flatten())
        .filter_map(|event| match event.event_type {
            EventType::Scalar { value, .. } => Some(value),
            _ => None,
        })
        .collect();
    assert_eq!(scalars, vec!["foo"]);
}
3889
/// `---word1` is not a document marker; together with the following line it
/// must fold into the single plain scalar `"---word1 word2"`, as the spec
/// requires. Tracked by yaml-test-suite 82AN.
#[test]
fn three_dashes_followed_by_text_folds_continuation_line() {
    let events = parse_with_timeout("---word1\nword2\n").expect("parser hung");

    // Gather the text of every scalar event, in stream order.
    let mut scalars: Vec<&str> = Vec::new();
    for event in events.iter() {
        if let crate::parser::EventType::Scalar { value, .. } = &event.event_type {
            scalars.push(value.as_str());
        }
    }

    assert_eq!(scalars, vec!["---word1 word2"]);
}
3904
/// Regression guard: a tab separating the block-entry marker from a `-N`
/// value used to send the scanner into a hang through the same `-` match
/// arm. See yaml-test-suite Y79Y/010.
#[test]
fn dash_tab_negative_number_does_not_hang() {
    let outcome = parse_with_timeout("-\t-1\n");
    let events = outcome.expect("parser hung — `-\\t-1` should not produce an infinite loop");
    assert!(!events.is_empty(), "expected event stream, got none");
}
3914
/// Scanning the bare input `42` must yield exactly the token sequence
/// `StreamStart`, a scalar with text `42`, `StreamEnd`.
#[test]
fn test_basic_tokenization() {
    let mut scanner = BasicScanner::new("42".to_string());

    assert!(scanner.check_token());

    // First token: start of stream.
    let stream_start = scanner.get_token().unwrap().unwrap();
    assert!(matches!(stream_start.token_type, TokenType::StreamStart));

    // Second token: the number, delivered as a scalar.
    let number = scanner.get_token().unwrap().unwrap();
    match number.token_type {
        TokenType::Scalar(text, _) => assert_eq!(text, "42"),
        _ => panic!("Expected scalar token"),
    }

    // Third token: end of stream.
    let stream_end = scanner.get_token().unwrap().unwrap();
    assert!(matches!(stream_end.token_type, TokenType::StreamEnd));
}
3937
/// Scans the opening of the flow sequence `[1, 2, 3]` and checks the first
/// tokens that follow `StreamStart`: `FlowSequenceStart`, the scalar `1`,
/// and the `FlowEntry` produced by the first comma.
#[test]
fn test_flow_sequence() {
    let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

    // StreamStart
    scanner.get_token().unwrap();

    // [
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

    // 1 — a non-scalar token here must fail the test instead of silently
    // skipping the value check.
    let token = scanner.get_token().unwrap().unwrap();
    match token.token_type {
        TokenType::Scalar(value, _) => assert_eq!(value, "1"),
        _ => panic!("Expected scalar token"),
    }

    // ,
    let token = scanner.get_token().unwrap().unwrap();
    assert!(matches!(token.token_type, TokenType::FlowEntry));
}
3959
/// A double-quoted string must be scanned into a scalar token whose value is
/// the text between the quotes.
#[test]
fn test_quoted_strings() {
    let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

    // Discard the leading StreamStart token.
    scanner.get_token().unwrap();

    // The next token is the quoted string itself.
    let quoted = scanner.get_token().unwrap().unwrap();
    match quoted.token_type {
        TokenType::Scalar(text, _) => assert_eq!(text, "hello world"),
        _ => panic!("Expected scalar token"),
    }
}
3975
/// Full-line and end-of-line comments must be dropped by the scanner: only
/// the mapping keys and values show up as scalars, and no `Comment` token is
/// emitted.
#[test]
fn test_comment_handling() {
    let input = r"
# Full line comment
key: value # End of line comment
# Another comment
data: test
";
    let mut scanner = BasicScanner::new(input.to_string());

    // Collect tokens until the scanner reports end-of-stream or an error.
    let tokens: Vec<_> = std::iter::from_fn(|| scanner.get_token().ok().flatten()).collect();

    // Only the YAML structure should survive: the scalar texts in order.
    let mut scalar_values: Vec<String> = Vec::new();
    for token in &tokens {
        if let TokenType::Scalar(text, _) = &token.token_type {
            scalar_values.push(text.clone());
        }
    }

    assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

    // No token may be a comment token.
    assert!(
        tokens
            .iter()
            .all(|token| !matches!(token.token_type, TokenType::Comment(_)))
    );
}
4009
/// A `#` inside a single- or double-quoted string is ordinary content, while
/// a `#` after a plain value still starts a comment whose text must not leak
/// into any scalar.
#[test]
fn test_hash_in_strings() {
    let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
    let mut scanner = BasicScanner::new(input.to_string());

    // Scan until end-of-stream or the first error, keeping scalar texts only.
    let scalar_values: Vec<_> = std::iter::from_fn(|| scanner.get_token().ok().flatten())
        .filter_map(|token| match token.token_type {
            TokenType::Scalar(text, _) => Some(text),
            _ => None,
        })
        .collect();

    let has_scalar = |wanted: &str| scalar_values.iter().any(|s| s.as_str() == wanted);

    assert!(has_scalar("This has a # character"));
    assert!(has_scalar("Also has # character"));
    assert!(has_scalar("value"));
    assert!(
        scalar_values
            .iter()
            .all(|s| !s.contains("This is a comment"))
    );
}
4035
/// Checks the common double-quoted escape sequences from YAML 1.2 §5.7.
///
/// Single-quoted strings have no backslash escapes at all (`''` is their only
/// escape), so every case here is double-quoted.
#[test]
fn test_escape_sequences() {
    // (quoted source text, expected decoded scalar value)
    let cases = [
        (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
        (r#""Col1\tCol2""#, "Col1\tCol2"),
        (r#""First\rSecond""#, "First\rSecond"),
        (r#""Path\\to\\file""#, "Path\\to\\file"),
        (r#""He said \"Hello\"""#, "He said \"Hello\""),
    ];

    for (input, expected) in cases {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => {
                    assert_eq!(value, expected, "Failed for input: {}", input);
                }
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    }
}
4064
/// Checks the less common YAML double-quoted escapes: each one must decode
/// to the single character listed next to it.
#[test]
fn test_extended_yaml_escapes() {
    // (quoted source text, expected decoded scalar value)
    let cases = [
        (r#""\0""#, "\0"),   // null character
        (r#""\a""#, "\x07"), // bell
        (r#""\b""#, "\x08"), // backspace
        (r#""\f""#, "\x0C"), // form feed
        (r#""\v""#, "\x0B"), // vertical tab
        (r#""\e""#, "\x1B"), // escape
        (r#""\ ""#, " "),    // literal space
        (r#""\/""#, "/"),    // literal forward slash
    ];

    for (input, expected) in cases {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => {
                    assert_eq!(value, expected, "Failed for input: {}", input);
                }
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    }
}
4094
/// Unknown escapes inside double quotes are scan errors under YAML 1.2 §5.7;
/// they are not kept as literal text. (Earlier versions of this scanner
/// preserved the backslash and the following character verbatim — see the
/// commit history.)
#[test]
fn test_unknown_escape_sequences() {
    let invalid_inputs = [r#""\z""#, r#""\q""#, r#""\8""#];

    for input in invalid_inputs {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // StreamStart

        let outcome = scanner.get_token();
        assert!(
            outcome.is_err(),
            "expected scan error for invalid escape in {input}"
        );
    }
}
4109}