rust_yaml/scanner/
mod.rs

1//! YAML scanner for tokenization
2
3use crate::{error::ErrorContext, Error, Limits, Position, ResourceTracker, Result};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10// pub mod optimizations; // Temporarily disabled
11pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13// pub use optimizations::*;
14
/// Trait for YAML scanners that convert character streams to tokens.
///
/// The trait exposes a peek/consume view of the token stream together with
/// accessors for the current input position and the input text itself.
pub trait Scanner {
    /// Check if there are more tokens available
    fn check_token(&self) -> bool;

    /// Peek at the next token without consuming it
    ///
    /// NOTE(review): `Ok(None)` presumably signals an exhausted stream —
    /// confirm against the implementors.
    fn peek_token(&self) -> Result<Option<&Token>>;

    /// Get the next token, consuming it
    ///
    /// NOTE(review): `Ok(None)` presumably signals an exhausted stream —
    /// confirm against the implementors.
    fn get_token(&mut self) -> Result<Option<Token>>;

    /// Reset the scanner state
    fn reset(&mut self);

    /// Get the current position in the input
    fn position(&self) -> Position;

    /// Get the input text for error reporting
    fn input(&self) -> &str;
}
35
/// A basic scanner implementation for YAML tokenization
///
/// Holds the input text, a cursor into a pre-decoded character cache, the
/// tokens produced so far, and the bookkeeping used for block indentation,
/// indentation-style detection, resource limits and deferred error reporting.
#[derive(Debug)]
#[allow(dead_code)]
pub struct BasicScanner {
    // Original input text; also used to build error contexts.
    input: String,
    // Position of `current_char`; moved forward by `advance`.
    position: Position,
    // Character under the cursor; `None` once the cached input is exhausted.
    current_char: Option<char>,
    // Tokens produced so far (e.g. `BlockEnd` tokens pushed on dedent).
    tokens: Vec<Token>,
    // NOTE(review): presumably the index of the next token to hand out — confirm.
    token_index: usize,
    // Set to `true` at construction when the input exceeds the size limit.
    done: bool,
    // Stack of open block indentation levels; always starts as `[0]`.
    indent_stack: Vec<usize>,
    // Indentation measured for the most recently handled line.
    current_indent: usize,
    // NOTE(review): two similarly named simple-key flags; both start `true`
    // for a usable scanner — confirm whether both are still needed.
    allow_simple_key: bool,
    simple_key_allowed: bool,
    // Indentation handling is skipped while this is greater than zero.
    flow_level: usize,
    // Enabled by the `*_with_comments` constructors.
    preserve_comments: bool,
    // Indentation style detection
    detected_indent_style: Option<crate::value::IndentStyle>,
    indent_samples: Vec<(usize, bool)>, // (size, is_tabs)
    previous_indent_level: usize,       // Track the previous indentation for style detection
    // Performance optimizations
    buffer: String,                   // Reusable string buffer for token values
    char_cache: Vec<char>,            // Cached characters for faster access
    char_indices: Vec<(usize, char)>, // Cached character indices for O(1) lookups
    current_char_index: usize,        // Current index in char_cache
    profiler: Option<crate::profiling::YamlProfiler>, // Optional profiling
    // Error tracking
    scanning_error: Option<Error>, // Store scanning errors for later retrieval
    // Resource tracking
    limits: Limits,
    resource_tracker: ResourceTracker,
    // Track inline nested sequences that need closing
    inline_sequence_depth: usize,
}
70
71impl BasicScanner {
72    /// Create a new scanner from input string
73    pub fn new(input: String) -> Self {
74        Self::with_limits(input, Limits::default())
75    }
76
77    /// Create a new scanner with custom resource limits
78    pub fn with_limits(input: String, limits: Limits) -> Self {
79        let char_cache: Vec<char> = input.chars().collect();
80        let char_indices: Vec<(usize, char)> = input.char_indices().collect();
81        let current_char = char_cache.first().copied();
82
83        // Track document size for resource limits
84        let mut resource_tracker = ResourceTracker::new();
85        if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
86            // If the input is too large, create scanner with error state
87            return Self {
88                current_char: None,
89                input,
90                position: Position::start(),
91                tokens: Vec::new(),
92                token_index: 0,
93                done: true,
94                indent_stack: vec![0],
95                current_indent: 0,
96                allow_simple_key: false,
97                simple_key_allowed: false,
98                flow_level: 0,
99                preserve_comments: false,
100                detected_indent_style: None,
101                indent_samples: Vec::new(),
102                previous_indent_level: 0,
103                buffer: String::new(),
104                char_cache: Vec::new(),
105                char_indices: Vec::new(),
106                current_char_index: 0,
107                profiler: None,
108                scanning_error: Some(e),
109                limits,
110                resource_tracker,
111                inline_sequence_depth: 0,
112            };
113        }
114
115        Self {
116            current_char,
117            input,
118            position: Position::start(),
119            tokens: Vec::new(),
120            token_index: 0,
121            done: false,
122            indent_stack: vec![0], // Always start with base indentation
123            current_indent: 0,
124            allow_simple_key: true,
125            simple_key_allowed: true,
126            flow_level: 0,
127            preserve_comments: false,
128            detected_indent_style: None,
129            indent_samples: Vec::new(),
130            previous_indent_level: 0,
131            buffer: String::with_capacity(64), // Pre-allocate buffer
132            char_cache,
133            char_indices,
134            current_char_index: 0,
135            profiler: std::env::var("RUST_YAML_PROFILE")
136                .ok()
137                .map(|_| crate::profiling::YamlProfiler::new()),
138            scanning_error: None,
139            limits,
140            resource_tracker,
141            inline_sequence_depth: 0,
142        }
143    }
144
145    /// Create a new scanner with eager token scanning (for compatibility)
146    pub fn new_eager(input: String) -> Self {
147        Self::new_eager_with_limits(input, Limits::default())
148    }
149
150    /// Create a new scanner with eager token scanning and custom limits
151    pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
152        let mut scanner = Self::with_limits(input, limits);
153        // Store any scanning errors for later retrieval
154        if let Err(error) = scanner.scan_all_tokens() {
155            scanner.scanning_error = Some(error);
156        }
157        scanner
158    }
159
160    /// Create a new scanner with comment preservation enabled
161    pub fn new_with_comments(input: String) -> Self {
162        let mut scanner = Self::new(input);
163        scanner.preserve_comments = true;
164        scanner
165    }
166
167    /// Create a new scanner with comments and custom limits
168    pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
169        let mut scanner = Self::with_limits(input, limits);
170        scanner.preserve_comments = true;
171        scanner
172    }
173
174    /// Create a new scanner with eager scanning and comment preservation
175    pub fn new_eager_with_comments(input: String) -> Self {
176        let mut scanner = Self::new_with_comments(input);
177        scanner.scan_all_tokens().unwrap_or(());
178        scanner
179    }
180
181    /// Get the detected indentation style from the document
182    pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
183        self.detected_indent_style.as_ref()
184    }
185
186    /// Check if there was a scanning error
187    pub const fn has_scanning_error(&self) -> bool {
188        self.scanning_error.is_some()
189    }
190
191    /// Get the scanning error if any
192    #[allow(clippy::missing_const_for_fn)]
193    pub fn take_scanning_error(&mut self) -> Option<Error> {
194        self.scanning_error.take()
195    }
196
197    /// Advance to the next character
198    fn advance(&mut self) -> Option<char> {
199        if let Some(ch) = self.current_char {
200            self.position = self.position.advance(ch);
201            self.current_char_index += 1;
202
203            if self.current_char_index < self.char_cache.len() {
204                self.current_char = Some(self.char_cache[self.current_char_index]);
205            } else {
206                self.current_char = None;
207            }
208        }
209
210        self.current_char
211    }
212
213    /// Skip whitespace characters (excluding newlines)
214    fn skip_whitespace(&mut self) {
215        while let Some(ch) = self.current_char {
216            if ch == ' ' || ch == '\t' {
217                self.advance();
218            } else {
219                break;
220            }
221        }
222    }
223
224    /// Handle indentation and produce block tokens if necessary
225    fn handle_indentation(&mut self) -> Result<()> {
226        // Only handle indentation in block context (flow_level == 0)
227        if self.flow_level > 0 {
228            return Ok(());
229        }
230
231        let line_start_pos = self.position;
232        let mut indent = 0;
233        let mut has_tabs = false;
234        let mut has_spaces = false;
235        let _indent_start_pos = self.position;
236
237        // Count indentation and detect style
238        while let Some(ch) = self.current_char {
239            if ch == ' ' {
240                indent += 1;
241                has_spaces = true;
242                self.advance();
243            } else if ch == '\t' {
244                indent += 8; // Tab counts as 8 spaces for indentation calculation
245                has_tabs = true;
246                self.advance();
247            } else {
248                break;
249            }
250        }
251
252        // Analyze indentation pattern for style detection
253        // Only analyze if there's actual content after the indentation (not just whitespace)
254        if indent > 0
255            && self.current_char.is_some()
256            && !matches!(self.current_char, Some('\n' | '\r'))
257        {
258            self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
259        }
260
261        // Perform strict indentation validation if we have established a style
262        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
263            if indent > 0 && indent % width != 0 {
264                // Check if this is a valid nested level or inconsistent indentation
265                let is_valid_nesting = self.is_valid_indentation_level(indent);
266                if !is_valid_nesting {
267                    let lower_level = (indent / width) * width;
268                    let higher_level = lower_level + width;
269                    let suggestion = format!(
270                        "Inconsistent indentation detected. Expected multiples of {} spaces. Use {} or {} spaces instead of {}",
271                        width, lower_level, higher_level, indent
272                    );
273                    let context =
274                        crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
275                            .with_suggestion(suggestion);
276                    return Err(Error::indentation_with_context(
277                        self.position,
278                        lower_level,
279                        indent,
280                        context,
281                    ));
282                }
283            }
284        }
285
286        // Update previous indentation level for future comparisons
287        if indent > 0 {
288            self.previous_indent_level = indent;
289        }
290
291        // Update current indentation level
292        self.current_indent = indent;
293
294        // Check if we need to emit block end tokens for decreased indentation
295        while let Some(&last_indent) = self.indent_stack.last() {
296            if indent < last_indent && last_indent > 0 {
297                self.indent_stack.pop();
298                self.tokens
299                    .push(Token::simple(TokenType::BlockEnd, line_start_pos));
300            } else {
301                break;
302            }
303        }
304
305        Ok(())
306    }
307
308    /// Analyze indentation pattern to detect the document's indentation style
309    fn analyze_indentation_pattern(
310        &mut self,
311        current_indent: usize,
312        has_tabs: bool,
313        has_spaces: bool,
314    ) -> Result<()> {
315        // Prevent mixed indentation (tabs + spaces on same line)
316        if has_tabs && has_spaces {
317            let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
318                .with_suggestion("Use either tabs OR spaces for indentation, not both".to_string());
319            return Err(Error::invalid_character_with_context(
320                self.position,
321                '\t',
322                "mixed indentation",
323                context,
324            ));
325        }
326
327        // If we detected tabs, check for mixed indentation across lines
328        if has_tabs {
329            match self.detected_indent_style {
330                None => {
331                    // First time detecting indentation style - set to tabs
332                    self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
333                }
334                Some(crate::value::IndentStyle::Spaces(_)) => {
335                    // Previously detected spaces, now seeing tabs - mixed indentation error
336                    let context =
337                        crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
338                            .with_suggestion(
339                                "Use consistent indentation style throughout the document"
340                                    .to_string(),
341                            );
342                    return Err(Error::invalid_character_with_context(
343                        self.position,
344                        '\t',
345                        "mixed indentation",
346                        context,
347                    ));
348                }
349                Some(crate::value::IndentStyle::Tabs) => {
350                    // Already using tabs - this is consistent
351                }
352            }
353            return Ok(());
354        }
355
356        // For spaces, check for mixed indentation across lines first
357        if has_spaces {
358            // Check if we previously detected tabs
359            if matches!(
360                self.detected_indent_style,
361                Some(crate::value::IndentStyle::Tabs)
362            ) {
363                let context =
364                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
365                        .with_suggestion(
366                            "Use consistent indentation style throughout the document".to_string(),
367                        );
368                return Err(Error::invalid_character_with_context(
369                    self.position,
370                    ' ',
371                    "mixed indentation",
372                    context,
373                ));
374            }
375
376            // Calculate the indentation level difference
377            if current_indent > self.previous_indent_level {
378                let indent_diff = current_indent - self.previous_indent_level;
379
380                // Store this sample for analysis (but only meaningful differences)
381                if indent_diff > 0 && indent_diff <= 8 {
382                    // Reasonable indentation range
383                    self.indent_samples.push((indent_diff, false));
384
385                    // Try to determine consistent indentation width
386                    if self.detected_indent_style.is_none() {
387                        self.detect_space_indentation_width();
388                    }
389                }
390            }
391
392            // Validate indentation consistency if we already have a detected style
393            self.validate_indentation_consistency(current_indent)?;
394        }
395
396        Ok(())
397    }
398
399    /// Detect the consistent space indentation width from samples
400    fn detect_space_indentation_width(&mut self) {
401        if self.indent_samples.is_empty() {
402            return; // Need at least 1 sample
403        }
404
405        // Find the most common indentation width
406        let mut width_counts = std::collections::HashMap::new();
407
408        for &(width, is_tabs) in &self.indent_samples {
409            if !is_tabs && width > 0 {
410                *width_counts.entry(width).or_insert(0) += 1;
411            }
412        }
413
414        // Find the most frequent width - be more aggressive and detect early
415        if let Some((&most_common_width, &_count)) =
416            width_counts.iter().max_by_key(|&(_, count)| count)
417        {
418            // Set on first consistent sample to enable stricter validation
419            self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
420        }
421    }
422
423    /// Check if the given indentation level is valid based on current context
424    #[allow(clippy::missing_const_for_fn)] // Cannot be const due to self.detected_indent_style access
425    fn is_valid_indentation_level(&self, indent: usize) -> bool {
426        // For now, allow any indentation that could represent valid nesting
427        // In the future, this could be made more strict by checking against
428        // the current indent_stack to ensure proper nesting
429        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
430            // Must be a multiple of the detected width
431            indent % width == 0
432        } else {
433            // If no style detected yet, allow any indentation
434            true
435        }
436    }
437
438    /// Validate that current indentation is consistent with detected style
439    fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
440        if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
441            // Check if current indentation is a multiple of the detected width
442            if current_indent > 0 && current_indent % width != 0 {
443                let lower_level = (current_indent / width) * width;
444                let higher_level = lower_level + width;
445                let suggestion = format!(
446                    "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
447                    width, lower_level, higher_level, current_indent
448                );
449                let context =
450                    crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
451                        .with_suggestion(suggestion);
452                return Err(Error::indentation_with_context(
453                    self.position,
454                    (current_indent / width) * width, // expected (nearest valid level)
455                    current_indent,                   // found
456                    context,
457                ));
458            }
459        }
460        Ok(())
461    }
462
463    /// Check if current position starts a plain scalar
464    fn is_plain_scalar_start(&self) -> bool {
465        self.current_char.map_or(false, |ch| match ch {
466            '-' | '?' | ':' | ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>'
467            | '\'' | '"' | '%' | '@' | '`' => false,
468            _ => !ch.is_whitespace(),
469        })
470    }
471
472    /// Check if the value is a YAML boolean
473    fn is_yaml_bool(value: &str) -> bool {
474        matches!(
475            value,
476            "true"
477                | "false"
478                | "True"
479                | "False"
480                | "TRUE"
481                | "FALSE"
482                | "yes"
483                | "no"
484                | "Yes"
485                | "No"
486                | "YES"
487                | "NO"
488                | "on"
489                | "off"
490                | "On"
491                | "Off"
492                | "ON"
493                | "OFF"
494        )
495    }
496
497    /// Check if the value is a YAML null
498    fn is_yaml_null(value: &str) -> bool {
499        matches!(value, "null" | "Null" | "NULL" | "~" | "")
500    }
501
502    /// Normalize a scalar value based on YAML rules
503    fn normalize_scalar(value: String) -> String {
504        if Self::is_yaml_bool(&value) {
505            // Normalize booleans to lowercase
506            match value.to_lowercase().as_str() {
507                "true" | "yes" | "on" => "true".to_string(),
508                "false" | "no" | "off" => "false".to_string(),
509                _ => value,
510            }
511        } else if Self::is_yaml_null(&value) {
512            // Normalize nulls to empty string (will be handled by parser)
513            "null".to_string()
514        } else {
515            value
516        }
517    }
518
519    /// Scan a number token
520    fn scan_number(&mut self) -> Result<Token> {
521        let start_pos = self.position;
522        let mut value = String::new();
523
524        // Handle negative numbers
525        if self.current_char == Some('-') {
526            value.push('-');
527            self.advance();
528        }
529
530        // Scan digits
531        while let Some(ch) = self.current_char {
532            if ch.is_ascii_digit() {
533                value.push(ch);
534                self.advance();
535            } else if ch == '.' {
536                value.push(ch);
537                self.advance();
538                // Scan fractional part
539                while let Some(ch) = self.current_char {
540                    if ch.is_ascii_digit() {
541                        value.push(ch);
542                        self.advance();
543                    } else {
544                        break;
545                    }
546                }
547                break;
548            } else {
549                break;
550            }
551        }
552
553        Ok(Token::new(
554            TokenType::Scalar(value, tokens::QuoteStyle::Plain),
555            start_pos,
556            self.position,
557        ))
558    }
559
560    /// Scan a plain scalar (unquoted string)
561    fn scan_plain_scalar(&mut self) -> Result<Token> {
562        let start_pos = self.position;
563        let mut value = String::new();
564
565        while let Some(ch) = self.current_char {
566            // Stop at structural characters in block context
567            if self.flow_level == 0 {
568                match ch {
569                    '\n' | '\r' => break,
570                    ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
571                    '#' if value.is_empty()
572                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
573                    {
574                        break;
575                    }
576                    _ => {}
577                }
578            } else {
579                // In flow context, stop at flow indicators
580                match ch {
581                    ',' | '[' | ']' | '{' | '}' => break,
582                    ':' if self
583                        .peek_char(1)
584                        .map_or(true, |c| c.is_whitespace() || "]}".contains(c)) =>
585                    {
586                        break;
587                    }
588                    '#' if value.is_empty()
589                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
590                    {
591                        break;
592                    }
593                    _ => {}
594                }
595            }
596
597            value.push(ch);
598            self.advance();
599        }
600
601        // Check string length limit
602        self.resource_tracker
603            .check_string_length(&self.limits, value.len())?;
604
605        // Trim trailing whitespace from plain scalars
606        let value = value.trim_end().to_string();
607        let normalized_value = Self::normalize_scalar(value);
608
609        Ok(Token::new(
610            TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
611            start_pos,
612            self.position,
613        ))
614    }
615
616    /// Scan a quoted string
617    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
618        let start_pos = self.position;
619        let mut value = String::new();
620
621        // Determine quote style based on quote character
622        let quote_style = match quote_char {
623            '\'' => tokens::QuoteStyle::Single,
624            '"' => tokens::QuoteStyle::Double,
625            _ => tokens::QuoteStyle::Plain,
626        };
627
628        self.advance(); // Skip opening quote
629
630        while let Some(ch) = self.current_char {
631            if ch == quote_char {
632                self.advance(); // Skip closing quote
633                break;
634            } else if ch == '\\' {
635                self.advance();
636                if let Some(escaped) = self.current_char {
637                    match escaped {
638                        // Standard C-style escapes
639                        'n' => value.push('\n'),  // newline
640                        't' => value.push('\t'),  // tab
641                        'r' => value.push('\r'),  // carriage return
642                        '\\' => value.push('\\'), // literal backslash
643                        '\'' => value.push('\''), // single quote
644                        '"' => value.push('"'),   // double quote
645
646                        // Additional YAML escapes
647                        '0' => value.push('\0'),   // null character
648                        'a' => value.push('\x07'), // bell character
649                        'b' => value.push('\x08'), // backspace
650                        'f' => value.push('\x0C'), // form feed
651                        'v' => value.push('\x0B'), // vertical tab
652                        'e' => value.push('\x1B'), // escape character
653                        ' ' => value.push(' '),    // literal space
654                        '/' => value.push('/'),    // literal forward slash
655
656                        // For unknown escapes, preserve them literally (YAML spec behavior)
657                        _ => {
658                            value.push('\\');
659                            value.push(escaped);
660                        }
661                    }
662                    self.advance();
663                }
664            } else {
665                value.push(ch);
666                self.advance();
667
668                // Check string length periodically to fail fast
669                if value.len() > self.limits.max_string_length {
670                    return Err(Error::limit_exceeded(format!(
671                        "String length {} exceeds maximum {}",
672                        value.len(),
673                        self.limits.max_string_length
674                    )));
675                }
676            }
677        }
678
679        // Check string length limit
680        self.resource_tracker
681            .check_string_length(&self.limits, value.len())?;
682
683        Ok(Token::new(
684            TokenType::Scalar(value, quote_style),
685            start_pos,
686            self.position,
687        ))
688    }
689
690    /// Scan document start marker (---)
691    fn scan_document_start(&mut self) -> Result<Option<Token>> {
692        if self.current_char == Some('-')
693            && self.peek_char(1) == Some('-')
694            && self.peek_char(2) == Some('-')
695            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
696        {
697            let start_pos = self.position;
698            self.advance(); // -
699            self.advance(); // -
700            self.advance(); // -
701
702            Ok(Some(Token::new(
703                TokenType::DocumentStart,
704                start_pos,
705                self.position,
706            )))
707        } else {
708            Ok(None)
709        }
710    }
711
712    /// Scan YAML version directive (%YAML)
713    fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
714        if self.current_char != Some('%') {
715            return Ok(None);
716        }
717
718        let start_pos = self.position;
719        let saved_position = self.position;
720        self.advance(); // Skip '%'
721
722        // Check for "YAML"
723        if self.current_char == Some('Y')
724            && self.peek_char(1) == Some('A')
725            && self.peek_char(2) == Some('M')
726            && self.peek_char(3) == Some('L')
727            && self.peek_char(4).map_or(false, |c| c.is_whitespace())
728        {
729            self.advance(); // Y
730            self.advance(); // A
731            self.advance(); // M
732            self.advance(); // L
733
734            // Skip whitespace
735            self.skip_whitespace();
736
737            // Parse version number (e.g., "1.2")
738            let major = if let Some(ch) = self.current_char {
739                if ch.is_ascii_digit() {
740                    let digit = ch.to_digit(10).unwrap() as u8;
741                    self.advance();
742                    digit
743                } else {
744                    return Err(Error::scan(
745                        self.position,
746                        "Expected major version number after %YAML".to_string(),
747                    ));
748                }
749            } else {
750                return Err(Error::scan(
751                    self.position,
752                    "Expected version after %YAML directive".to_string(),
753                ));
754            };
755
756            // Expect '.'
757            if self.current_char != Some('.') {
758                return Err(Error::scan(
759                    self.position,
760                    "Expected '.' in YAML version".to_string(),
761                ));
762            }
763            self.advance();
764
765            // Parse minor version
766            let minor = if let Some(ch) = self.current_char {
767                if ch.is_ascii_digit() {
768                    let digit = ch.to_digit(10).unwrap() as u8;
769                    self.advance();
770                    digit
771                } else {
772                    return Err(Error::scan(
773                        self.position,
774                        "Expected minor version number after '.'".to_string(),
775                    ));
776                }
777            } else {
778                return Err(Error::scan(
779                    self.position,
780                    "Expected minor version number".to_string(),
781                ));
782            };
783
784            Ok(Some(Token::new(
785                TokenType::YamlDirective(major, minor),
786                start_pos,
787                self.position,
788            )))
789        } else {
790            // Not a YAML directive, reset position
791            self.position = saved_position;
792            // Properly reset current_char based on saved position
793            self.current_char = self
794                .char_indices
795                .iter()
796                .find(|(i, _)| *i == saved_position.index)
797                .map(|(_, ch)| *ch);
798            // Reset the current_char_index
799            self.current_char_index = self
800                .char_indices
801                .iter()
802                .position(|(i, _)| *i == saved_position.index)
803                .unwrap_or(0);
804            Ok(None)
805        }
806    }
807
808    /// Scan TAG directive (%TAG)
809    fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
810        if self.current_char != Some('%') {
811            return Ok(None);
812        }
813
814        let start_pos = self.position;
815        let saved_position = self.position;
816        self.advance(); // Skip '%'
817
818        // Check for "TAG"
819        if self.current_char == Some('T')
820            && self.peek_char(1) == Some('A')
821            && self.peek_char(2) == Some('G')
822            && self.peek_char(3).map_or(false, |c| c.is_whitespace())
823        {
824            self.advance(); // T
825            self.advance(); // A
826            self.advance(); // G
827
828            // Skip whitespace
829            self.skip_whitespace();
830
831            // Parse handle (e.g., "!" or "!!")
832            let handle = self.scan_tag_handle()?;
833
834            // Skip whitespace
835            self.skip_whitespace();
836
837            // Parse prefix (URI)
838            let prefix = self.scan_tag_prefix()?;
839
840            Ok(Some(Token::new(
841                TokenType::TagDirective(handle, prefix),
842                start_pos,
843                self.position,
844            )))
845        } else {
846            // Reset position if not a TAG directive
847            self.position = saved_position;
848            // Properly reset current_char based on saved position
849            self.current_char = self
850                .char_indices
851                .iter()
852                .find(|(i, _)| *i == saved_position.index)
853                .map(|(_, ch)| *ch);
854            // Reset the current_char_index
855            self.current_char_index = self
856                .char_indices
857                .iter()
858                .position(|(i, _)| *i == saved_position.index)
859                .unwrap_or(0);
860            Ok(None)
861        }
862    }
863
864    /// Scan a tag handle for TAG directive
865    fn scan_tag_handle(&mut self) -> Result<String> {
866        let mut handle = String::new();
867
868        if self.current_char != Some('!') {
869            return Err(Error::scan(
870                self.position,
871                "Expected '!' at start of tag handle".to_string(),
872            ));
873        }
874
875        handle.push('!');
876        self.advance();
877
878        // Handle can be "!" or "!!" or "!name!"
879        if self.current_char == Some('!') {
880            // Secondary handle "!!"
881            handle.push('!');
882            self.advance();
883        } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
884            // Named handle like "!name!"
885            while let Some(ch) = self.current_char {
886                if ch.is_alphanumeric() || ch == '-' || ch == '_' {
887                    handle.push(ch);
888                    self.advance();
889                } else if ch == '!' {
890                    handle.push(ch);
891                    self.advance();
892                    break;
893                } else {
894                    break;
895                }
896            }
897        }
898        // else just "!" primary handle
899
900        Ok(handle)
901    }
902
903    /// Scan a tag prefix (URI) for TAG directive
904    fn scan_tag_prefix(&mut self) -> Result<String> {
905        let mut prefix = String::new();
906
907        // Read until end of line or comment
908        while let Some(ch) = self.current_char {
909            if ch == '\n' || ch == '\r' || ch == '#' {
910                break;
911            }
912            if ch.is_whitespace() && prefix.is_empty() {
913                self.advance();
914                continue;
915            }
916            if ch.is_whitespace() && !prefix.is_empty() {
917                // Trailing whitespace, we're done
918                break;
919            }
920            prefix.push(ch);
921            self.advance();
922        }
923
924        if prefix.is_empty() {
925            return Err(Error::scan(
926                self.position,
927                "Expected tag prefix after tag handle".to_string(),
928            ));
929        }
930
931        Ok(prefix.trim().to_string())
932    }
933
934    /// Check if current position might be a directive
935    fn is_directive(&self) -> bool {
936        self.current_char == Some('%') && self.position.column == 1
937    }
938
939    /// Scan document end marker (...)
940    fn scan_document_end(&mut self) -> Result<Option<Token>> {
941        if self.current_char == Some('.')
942            && self.peek_char(1) == Some('.')
943            && self.peek_char(2) == Some('.')
944            && self.peek_char(3).map_or(true, |c| c.is_whitespace())
945        {
946            let start_pos = self.position;
947            self.advance(); // .
948            self.advance(); // .
949            self.advance(); // .
950
951            Ok(Some(Token::new(
952                TokenType::DocumentEnd,
953                start_pos,
954                self.position,
955            )))
956        } else {
957            Ok(None)
958        }
959    }
960
961    /// Scan a comment token
962    fn scan_comment(&mut self) -> Result<Token> {
963        let start_pos = self.position;
964        let mut comment_text = String::new();
965
966        // Skip the '#' character
967        if self.current_char == Some('#') {
968            self.advance();
969        }
970
971        // Collect the comment text
972        while let Some(ch) = self.current_char {
973            if ch == '\n' || ch == '\r' {
974                break;
975            }
976            comment_text.push(ch);
977            self.advance();
978        }
979
980        // Trim leading whitespace from comment text
981        let comment_text = comment_text.trim_start().to_string();
982
983        Ok(Token::new(
984            TokenType::Comment(comment_text),
985            start_pos,
986            self.position,
987        ))
988    }
989
990    /// Process a line and generate appropriate tokens
991    #[allow(clippy::cognitive_complexity)]
992    fn process_line(&mut self) -> Result<()> {
993        // Check for directives at start of line
994        if self.position.column == 1 && self.current_char == Some('%') {
995            // Try to scan YAML directive
996            if let Some(token) = self.scan_yaml_directive()? {
997                self.tokens.push(token);
998                return Ok(());
999            }
1000
1001            // Try to scan TAG directive
1002            if let Some(token) = self.scan_tag_directive()? {
1003                self.tokens.push(token);
1004                return Ok(());
1005            }
1006
1007            // If not a recognized directive, treat as error
1008            if self.current_char == Some('%') {
1009                return Err(Error::scan(self.position, "Unknown directive".to_string()));
1010            }
1011        }
1012
1013        // Check for document markers at start of line
1014        if self.position.column == 1 {
1015            // Check for document start marker
1016            if let Some(token) = self.scan_document_start()? {
1017                self.tokens.push(token);
1018                return Ok(());
1019            }
1020
1021            // Check for document end marker
1022            if let Some(token) = self.scan_document_end()? {
1023                self.tokens.push(token);
1024                return Ok(());
1025            }
1026        }
1027
1028        // Handle indentation at start of line
1029        if self.position.column == 1 {
1030            self.handle_indentation()?;
1031        }
1032
1033        // Skip empty lines and comments
1034        self.skip_whitespace();
1035
1036        match self.current_char {
1037            None => return Ok(()),
1038            Some('#') => {
1039                if self.preserve_comments {
1040                    // Create a comment token
1041                    let comment_token = self.scan_comment()?;
1042                    self.tokens.push(comment_token);
1043                } else {
1044                    // Skip comment lines
1045                    while let Some(ch) = self.current_char {
1046                        if ch == '\n' || ch == '\r' {
1047                            break;
1048                        }
1049                        self.advance();
1050                    }
1051                }
1052                return Ok(());
1053            }
1054            Some('\n' | '\r') => {
1055                self.advance();
1056                return Ok(());
1057            }
1058            _ => {}
1059        }
1060
1061        // Process tokens on this line
1062        while let Some(ch) = self.current_char {
1063            match ch {
1064                '\n' | '\r' => break,
1065                ' ' | '\t' => {
1066                    self.skip_whitespace();
1067                }
1068                '#' => {
1069                    if self.preserve_comments {
1070                        // Create a comment token
1071                        let comment_token = self.scan_comment()?;
1072                        self.tokens.push(comment_token);
1073                    } else {
1074                        // Skip rest of line (comment)
1075                        while let Some(ch) = self.current_char {
1076                            if ch == '\n' || ch == '\r' {
1077                                break;
1078                            }
1079                            self.advance();
1080                        }
1081                    }
1082                    break;
1083                }
1084
1085                // Flow indicators
1086                '[' => {
1087                    let pos = self.position;
1088                    self.advance();
1089                    self.flow_level += 1;
1090                    // Check depth limit
1091                    self.resource_tracker
1092                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1093                    self.tokens
1094                        .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1095                }
1096                ']' => {
1097                    let pos = self.position;
1098                    self.advance();
1099                    if self.flow_level > 0 {
1100                        self.flow_level -= 1;
1101                    }
1102                    self.tokens
1103                        .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1104                }
1105                '{' => {
1106                    let pos = self.position;
1107                    self.advance();
1108                    self.flow_level += 1;
1109                    // Check depth limit
1110                    self.resource_tracker
1111                        .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1112                    self.tokens
1113                        .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1114                }
1115                '}' => {
1116                    let pos = self.position;
1117                    self.advance();
1118                    if self.flow_level > 0 {
1119                        self.flow_level -= 1;
1120                    }
1121                    self.tokens
1122                        .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1123                }
1124                ',' => {
1125                    let pos = self.position;
1126                    self.advance();
1127                    self.tokens
1128                        .push(Token::new(TokenType::FlowEntry, pos, self.position));
1129                }
1130
1131                // Key-value separator
1132                ':' => {
1133                    let pos = self.position;
1134                    self.advance();
1135                    self.tokens
1136                        .push(Token::new(TokenType::Value, pos, self.position));
1137                }
1138
1139                // Explicit key marker
1140                '?' if self.flow_level == 0
1141                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1142                        || self.peek_char(1).is_none()) =>
1143                {
1144                    let pos = self.position;
1145                    self.advance();
1146                    self.tokens
1147                        .push(Token::new(TokenType::Key, pos, self.position));
1148                }
1149                '?' if self.flow_level > 0
1150                    && (self
1151                        .peek_char(1)
1152                        .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1153                        || self.peek_char(1).is_none()) =>
1154                {
1155                    let pos = self.position;
1156                    self.advance();
1157                    self.tokens
1158                        .push(Token::new(TokenType::Key, pos, self.position));
1159                }
1160
1161                // Block entry
1162                '-' if self.flow_level == 0
1163                    && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1164                        || self.peek_char(1).is_none()) =>
1165                {
1166                    let pos = self.position;
1167                    self.advance();
1168
1169                    // Check if we need to start a new block sequence
1170                    let last_indent = *self.indent_stack.last().unwrap();
1171
1172                    if self.current_indent > last_indent {
1173                        // Deeper indentation - start new nested sequence
1174                        self.indent_stack.push(self.current_indent);
1175                        // Check depth limit
1176                        self.resource_tracker
1177                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1178                        self.tokens
1179                            .push(Token::simple(TokenType::BlockSequenceStart, pos));
1180                    } else if self.current_indent >= last_indent {
1181                        // Same or root level - check if we need to start a sequence
1182                        // We need BlockSequenceStart if we haven't started a sequence yet at this document level
1183                        let has_active_sequence = self
1184                            .tokens
1185                            .iter()
1186                            .rev()
1187                            .take_while(|t| {
1188                                !matches!(
1189                                    t.token_type,
1190                                    TokenType::StreamStart
1191                                        | TokenType::DocumentStart
1192                                        | TokenType::DocumentEnd
1193                                )
1194                            })
1195                            .any(|t| matches!(t.token_type, TokenType::BlockSequenceStart));
1196
1197                        if !has_active_sequence {
1198                            // Check depth limit
1199                            self.resource_tracker.check_depth(
1200                                &self.limits,
1201                                self.flow_level + self.indent_stack.len(),
1202                            )?;
1203                            self.tokens
1204                                .push(Token::simple(TokenType::BlockSequenceStart, pos));
1205                        }
1206                    }
1207
1208                    self.tokens
1209                        .push(Token::new(TokenType::BlockEntry, pos, self.position));
1210
1211                    // After emitting BlockEntry, check if the next token is another dash (nested sequence)
1212                    self.skip_whitespace();
1213                    if self.current_char == Some('-')
1214                        && self.peek_char(1).map_or(true, |c| c.is_whitespace())
1215                    {
1216                        // We have a nested sequence on the same line!
1217                        // Track this as an inline sequence
1218                        self.inline_sequence_depth += 1;
1219                        // Also push to indent_stack to track proper nesting
1220                        self.indent_stack.push(self.position.column);
1221                        // Check depth limit
1222                        self.resource_tracker
1223                            .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1224                        self.tokens
1225                            .push(Token::simple(TokenType::BlockSequenceStart, self.position));
1226                        // Continue processing - the next iteration will handle the nested dash
1227                    }
1228                }
1229
1230                // Quoted strings
1231                '"' => {
1232                    let token = self.scan_quoted_string('"')?;
1233                    self.tokens.push(token);
1234                }
1235                '\'' => {
1236                    let token = self.scan_quoted_string('\'')?;
1237                    self.tokens.push(token);
1238                }
1239
1240                // Document markers (only if not a block entry)
1241                '-' if self.position.column == self.current_indent + 1
1242                    && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
1243                {
1244                    if let Some(token) = self.scan_document_start()? {
1245                        self.tokens.push(token);
1246                    } else if self.is_plain_scalar_start() {
1247                        let token = self.scan_plain_scalar()?;
1248                        self.tokens.push(token);
1249                    }
1250                }
1251                '.' if self.position.column == self.current_indent + 1 => {
1252                    if let Some(token) = self.scan_document_end()? {
1253                        self.tokens.push(token);
1254                    } else if self.is_plain_scalar_start() {
1255                        let token = self.scan_plain_scalar()?;
1256                        self.tokens.push(token);
1257                    }
1258                }
1259
1260                // Numbers or plain scalars starting with -
1261                _ if ch.is_ascii_digit()
1262                    || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())) =>
1263                {
1264                    let token = self.scan_number()?;
1265                    self.tokens.push(token);
1266                }
1267
1268                // Anchors and aliases
1269                '&' => {
1270                    let token = self.scan_anchor()?;
1271                    self.tokens.push(token);
1272                }
1273                '*' => {
1274                    let token = self.scan_alias()?;
1275                    self.tokens.push(token);
1276                }
1277
1278                // Block scalars
1279                '|' => {
1280                    let token = self.scan_literal_block_scalar()?;
1281                    self.tokens.push(token);
1282                }
1283                '>' => {
1284                    let token = self.scan_folded_block_scalar()?;
1285                    self.tokens.push(token);
1286                }
1287
1288                // Tags
1289                '!' => {
1290                    let token = self.scan_tag()?;
1291                    self.tokens.push(token);
1292                }
1293
1294                // Plain scalars
1295                _ if self.is_plain_scalar_start() => {
1296                    // Look ahead to see if this is a mapping key
1297                    if self.flow_level == 0 {
1298                        let should_start_mapping = self.check_for_mapping_ahead();
1299                        if should_start_mapping {
1300                            let last_indent = *self.indent_stack.last().unwrap();
1301
1302                            // Check if we should start a new mapping
1303                            // Start a mapping if:
1304                            // 1. No mapping is active at this indentation level, OR
1305                            // 2. We're at a deeper indentation level (nested mapping)
1306                            let should_start_new_mapping = if self.current_indent > last_indent {
1307                                // Deeper indentation - start nested mapping
1308                                true
1309                            } else if self.current_indent == last_indent {
1310                                // Same indentation - check if there's an active mapping at this level
1311                                // We need to carefully track mapping contexts across BlockEnd tokens
1312                                let has_active_mapping_at_this_level =
1313                                    self.check_active_mapping_at_level(self.current_indent);
1314                                !has_active_mapping_at_this_level
1315                            } else {
1316                                // Shallower indentation - should have been handled by handle_indentation
1317                                false
1318                            };
1319
1320                            if should_start_new_mapping {
1321                                // Start mapping before processing the key
1322                                self.indent_stack.push(self.current_indent);
1323                                // Check depth limit
1324                                self.resource_tracker.check_depth(
1325                                    &self.limits,
1326                                    self.flow_level + self.indent_stack.len(),
1327                                )?;
1328                                self.tokens.push(Token::simple(
1329                                    TokenType::BlockMappingStart,
1330                                    self.position,
1331                                ));
1332                            }
1333                        }
1334                    }
1335
1336                    let token = self.scan_plain_scalar()?;
1337                    self.tokens.push(token);
1338                }
1339
1340                _ => {
1341                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
1342                        .with_suggestion("Check for valid YAML syntax characters".to_string());
1343                    return Err(Error::invalid_character_with_context(
1344                        self.position,
1345                        ch,
1346                        "YAML document",
1347                        context,
1348                    ));
1349                }
1350            }
1351        }
1352
1353        // After processing the line, close any inline sequences
1354        while self.inline_sequence_depth > 0 {
1355            self.inline_sequence_depth -= 1;
1356            // Also pop from indent_stack
1357            if self.indent_stack.len() > 1 {
1358                self.indent_stack.pop();
1359            }
1360            self.tokens
1361                .push(Token::simple(TokenType::BlockEnd, self.position));
1362        }
1363
1364        Ok(())
1365    }
1366
1367    /// Scan the next token lazily
1368    fn scan_next_token(&mut self) -> Result<()> {
1369        if self.done {
1370            return Ok(());
1371        }
1372
1373        // Add stream start token if this is the beginning
1374        if self.tokens.is_empty() {
1375            self.tokens
1376                .push(Token::simple(TokenType::StreamStart, self.position));
1377            return Ok(());
1378        }
1379
1380        // Check if we're at the end of input
1381        if self.current_char.is_none() {
1382            if !self
1383                .tokens
1384                .iter()
1385                .any(|t| matches!(t.token_type, TokenType::StreamEnd))
1386            {
1387                self.tokens
1388                    .push(Token::simple(TokenType::StreamEnd, self.position));
1389            }
1390            self.done = true;
1391            return Ok(());
1392        }
1393
1394        // For now, fall back to scanning all tokens at once for the lazy scanner
1395        // This is a simplified implementation - a full streaming parser would
1396        // need more sophisticated state management
1397        let tokens_before = self.tokens.len();
1398        self.scan_all_tokens()?;
1399
1400        // Mark as done after scanning all tokens
1401        if self.tokens.len() == tokens_before {
1402            self.done = true;
1403        }
1404
1405        Ok(())
1406    }
1407
1408    /// Pre-scan all tokens (simplified approach for basic implementation)
1409    fn scan_all_tokens(&mut self) -> Result<()> {
1410        // Only add StreamStart if we don't have it yet
1411        if !self
1412            .tokens
1413            .iter()
1414            .any(|t| matches!(t.token_type, TokenType::StreamStart))
1415        {
1416            self.tokens
1417                .push(Token::simple(TokenType::StreamStart, self.position));
1418        }
1419
1420        while self.current_char.is_some() {
1421            self.process_line()?;
1422
1423            // Advance past newlines
1424            while let Some(ch) = self.current_char {
1425                if ch == '\n' || ch == '\r' {
1426                    self.advance();
1427                } else {
1428                    break;
1429                }
1430            }
1431        }
1432
1433        // Close any remaining blocks
1434        while self.indent_stack.len() > 1 {
1435            self.indent_stack.pop();
1436            self.tokens
1437                .push(Token::simple(TokenType::BlockEnd, self.position));
1438        }
1439
1440        self.tokens
1441            .push(Token::simple(TokenType::StreamEnd, self.position));
1442        self.done = true;
1443        Ok(())
1444    }
1445
1446    /// Peek at a character at the given offset (can be negative)
1447    fn peek_char(&self, offset: isize) -> Option<char> {
1448        if offset >= 0 {
1449            let target_index = self.current_char_index + offset as usize;
1450            if target_index < self.char_cache.len() {
1451                Some(self.char_cache[target_index])
1452            } else {
1453                None
1454            }
1455        } else {
1456            let offset_magnitude = (-offset) as usize;
1457            if self.current_char_index >= offset_magnitude {
1458                Some(self.char_cache[self.current_char_index - offset_magnitude])
1459            } else {
1460                None
1461            }
1462        }
1463    }
1464
1465    /// Scan an anchor token (&name)
1466    fn scan_anchor(&mut self) -> Result<Token> {
1467        let start_pos = self.position;
1468        self.advance(); // Skip '&'
1469
1470        let name = self.scan_identifier()?;
1471        if name.is_empty() {
1472            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
1473                "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
1474            );
1475            return Err(Error::scan_with_context(
1476                self.position,
1477                "Anchor name cannot be empty",
1478                context,
1479            ));
1480        }
1481
1482        // Track anchor for resource limits
1483        self.resource_tracker.add_anchor(&self.limits)?;
1484
1485        Ok(Token::new(
1486            TokenType::Anchor(name),
1487            start_pos,
1488            self.position,
1489        ))
1490    }
1491
1492    /// Scan an alias token (*name)
1493    fn scan_alias(&mut self) -> Result<Token> {
1494        let start_pos = self.position;
1495        self.advance(); // Skip '*'
1496
1497        let name = self.scan_identifier()?;
1498        if name.is_empty() {
1499            let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
1500                "Provide a valid alias name after *, e.g., *alias_name".to_string(),
1501            );
1502            return Err(Error::scan_with_context(
1503                self.position,
1504                "Alias name cannot be empty",
1505                context,
1506            ));
1507        }
1508
1509        Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
1510    }
1511
1512    /// Scan an identifier (used for anchor and alias names)
1513    fn scan_identifier(&mut self) -> Result<String> {
1514        let mut identifier = String::new();
1515
1516        while let Some(ch) = self.current_char {
1517            if ch.is_alphanumeric() || ch == '_' || ch == '-' {
1518                identifier.push(ch);
1519                self.advance();
1520            } else {
1521                break;
1522            }
1523        }
1524
1525        Ok(identifier)
1526    }
1527
1528    /// Scan a tag token (!tag or !!tag or !<verbatim>)
1529    fn scan_tag(&mut self) -> Result<Token> {
1530        let start_pos = self.position;
1531        self.advance(); // Skip first '!'
1532
1533        let mut tag = String::from("!");
1534
1535        // Check for verbatim tag format: !<tag>
1536        if self.current_char == Some('<') {
1537            tag.push('<');
1538            self.advance(); // Skip '<'
1539
1540            // Scan until closing '>'
1541            while let Some(ch) = self.current_char {
1542                if ch == '>' {
1543                    tag.push(ch);
1544                    self.advance();
1545                    break;
1546                } else if ch.is_control() || ch.is_whitespace() {
1547                    return Err(Error::scan(
1548                        self.position,
1549                        "Invalid character in verbatim tag".to_string(),
1550                    ));
1551                }
1552                tag.push(ch);
1553                self.advance();
1554            }
1555        } else {
1556            // Check for secondary tag handle: !!
1557            if self.current_char == Some('!') {
1558                tag.push('!');
1559                self.advance(); // Skip second '!'
1560            }
1561
1562            // Scan tag name/suffix
1563            while let Some(ch) = self.current_char {
1564                if ch.is_alphanumeric() || "-./_:".contains(ch) {
1565                    tag.push(ch);
1566                    self.advance();
1567                } else {
1568                    break;
1569                }
1570            }
1571        }
1572
1573        Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
1574    }
1575
1576    /// Scan a literal block scalar (|)
1577    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
1578        let start_pos = self.position;
1579        self.advance(); // Skip '|'
1580
1581        // Parse block scalar header (indicators like +, -, explicit indent)
1582        let (keep_trailing, explicit_indent) = self.scan_block_scalar_header()?;
1583
1584        // Skip to next line
1585        self.skip_to_next_line()?;
1586
1587        // Determine indentation
1588        let base_indent = self.current_indent;
1589        let content_indent = if let Some(explicit) = explicit_indent {
1590            base_indent + explicit
1591        } else {
1592            // Find the first non-empty content line to determine indentation
1593            self.find_block_scalar_indent(base_indent)?
1594        };
1595
1596        // Collect the literal block content
1597        let content = self.collect_literal_block_content(content_indent, keep_trailing)?;
1598
1599        Ok(Token::new(
1600            TokenType::BlockScalarLiteral(content),
1601            start_pos,
1602            self.position,
1603        ))
1604    }
1605
1606    /// Scan a folded block scalar (>)
1607    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
1608        let start_pos = self.position;
1609        self.advance(); // Skip '>'
1610
1611        // Parse block scalar header (indicators like +, -, explicit indent)
1612        let (keep_trailing, explicit_indent) = self.scan_block_scalar_header()?;
1613
1614        // Skip to next line
1615        self.skip_to_next_line()?;
1616
1617        // Determine indentation
1618        let base_indent = self.current_indent;
1619        let content_indent = if let Some(explicit) = explicit_indent {
1620            base_indent + explicit
1621        } else {
1622            // Find the first non-empty content line to determine indentation
1623            self.find_block_scalar_indent(base_indent)?
1624        };
1625
1626        // Collect the folded block content
1627        let content = self.collect_folded_block_content(content_indent, keep_trailing)?;
1628
1629        Ok(Token::new(
1630            TokenType::BlockScalarFolded(content),
1631            start_pos,
1632            self.position,
1633        ))
1634    }
1635
1636    /// Parse block scalar header indicators (+, -, and explicit indent)
1637    fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)> {
1638        let mut keep_trailing = false;
1639        let mut explicit_indent: Option<usize> = None;
1640
1641        // Parse indicators in any order
1642        while let Some(ch) = self.current_char {
1643            match ch {
1644                '+' => {
1645                    keep_trailing = true;
1646                    self.advance();
1647                }
1648                '-' => {
1649                    keep_trailing = false; // Strip trailing newlines
1650                    self.advance();
1651                }
1652                '0'..='9' => {
1653                    let digit = ch.to_digit(10).unwrap() as usize;
1654                    if explicit_indent.is_some() {
1655                        let context = ErrorContext::from_input(&self.input, &self.position, 2)
1656                            .with_suggestion(
1657                                "Use only one indent indicator digit in block scalar".to_string(),
1658                            );
1659                        return Err(Error::scan_with_context(
1660                            self.position,
1661                            "Multiple indent indicators in block scalar",
1662                            context,
1663                        ));
1664                    }
1665                    explicit_indent = Some(digit);
1666                    self.advance();
1667                }
1668                ' ' | '\t' => {
1669                    self.advance(); // Skip whitespace
1670                }
1671                '#' => {
1672                    // Skip comment to end of line
1673                    while let Some(ch) = self.current_char {
1674                        self.advance();
1675                        if ch == '\n' || ch == '\r' {
1676                            break;
1677                        }
1678                    }
1679                    break;
1680                }
1681                '\n' | '\r' => break,
1682                _ => {
1683                    let context = ErrorContext::from_input(&self.input, &self.position, 2)
1684                        .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
1685                    return Err(Error::invalid_character_with_context(
1686                        self.position,
1687                        ch,
1688                        "block scalar header",
1689                        context,
1690                    ));
1691                }
1692            }
1693        }
1694
1695        Ok((keep_trailing, explicit_indent))
1696    }
1697
1698    /// Skip whitespace and comments to the next content line
1699    fn skip_to_next_line(&mut self) -> Result<()> {
1700        while let Some(ch) = self.current_char {
1701            match ch {
1702                '\n' | '\r' => {
1703                    self.advance();
1704                    break;
1705                }
1706                ' ' | '\t' => {
1707                    self.advance();
1708                }
1709                _ => break,
1710            }
1711        }
1712        Ok(())
1713    }
1714
1715    /// Find the content indentation for a block scalar
1716    fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
1717        let saved_position = self.position;
1718        let saved_char = self.current_char;
1719        let saved_char_index = self.current_char_index;
1720
1721        let mut content_indent = base_indent + 1; // Default minimum indent
1722
1723        // Look ahead to find the first non-empty line
1724        while let Some(ch) = self.current_char {
1725            self.advance();
1726            if ch == '\n' || ch == '\r' {
1727                let line_indent = self.count_line_indent();
1728
1729                // If this line has content (not just whitespace)
1730                if let Some(line_ch) = self.current_char {
1731                    if line_ch != '\n' && line_ch != '\r' {
1732                        if line_indent > base_indent {
1733                            content_indent = line_indent;
1734                            break;
1735                        }
1736                        // Content must be indented more than the block scalar indicator
1737                        content_indent = base_indent + 1;
1738                        break;
1739                    }
1740                }
1741            }
1742        }
1743
1744        // Restore position
1745        self.position = saved_position;
1746        self.current_char = saved_char;
1747        self.current_char_index = saved_char_index;
1748
1749        Ok(content_indent)
1750    }
1751
1752    /// Count indentation at start of current line
1753    fn count_line_indent(&mut self) -> usize {
1754        let mut indent = 0;
1755        let saved_position = self.position;
1756        let saved_char = self.current_char;
1757        let saved_char_index = self.current_char_index;
1758
1759        while let Some(ch) = self.current_char {
1760            if ch == ' ' {
1761                indent += 1;
1762                self.advance();
1763            } else if ch == '\t' {
1764                indent += 8; // Tab counts as 8 spaces
1765                self.advance();
1766            } else {
1767                break;
1768            }
1769        }
1770
1771        // Restore position
1772        self.position = saved_position;
1773        self.current_char = saved_char;
1774        self.current_char_index = saved_char_index;
1775
1776        indent
1777    }
1778
1779    /// Collect content for a literal block scalar
1780    fn collect_literal_block_content(
1781        &mut self,
1782        content_indent: usize,
1783        _keep_trailing: bool,
1784    ) -> Result<String> {
1785        let mut content = String::new();
1786
1787        while let Some(_) = self.current_char {
1788            let line_indent = self.count_line_indent();
1789
1790            // Skip indentation
1791            for _ in 0..content_indent.min(line_indent) {
1792                if let Some(' ' | '\t') = self.current_char {
1793                    self.advance();
1794                }
1795            }
1796
1797            // Collect line content
1798            let mut line = String::new();
1799            while let Some(ch) = self.current_char {
1800                if ch == '\n' || ch == '\r' {
1801                    self.advance();
1802                    break;
1803                }
1804                line.push(ch);
1805                self.advance();
1806            }
1807
1808            // Check if we've reached the end of the block scalar
1809            if line_indent < content_indent && !line.trim().is_empty() {
1810                // This line is part of the next construct
1811                break;
1812            }
1813
1814            // Add line to content (preserving literal newlines)
1815            content.push_str(&line);
1816            if self.current_char.is_some() {
1817                content.push('\n');
1818            }
1819
1820            // Check for end of input or document boundaries
1821            if self.current_char.is_none() {
1822                break;
1823            }
1824        }
1825
1826        Ok(content)
1827    }
1828
1829    /// Collect content for a folded block scalar
1830    fn collect_folded_block_content(
1831        &mut self,
1832        content_indent: usize,
1833        _keep_trailing: bool,
1834    ) -> Result<String> {
1835        let mut content = String::new();
1836        let mut prev_was_empty = false;
1837        let mut first_line = true;
1838
1839        while let Some(_) = self.current_char {
1840            let line_indent = self.count_line_indent();
1841
1842            // Skip indentation
1843            for _ in 0..content_indent.min(line_indent) {
1844                if let Some(' ' | '\t') = self.current_char {
1845                    self.advance();
1846                }
1847            }
1848
1849            // Collect line content
1850            let mut line = String::new();
1851            while let Some(ch) = self.current_char {
1852                if ch == '\n' || ch == '\r' {
1853                    self.advance();
1854                    break;
1855                }
1856                line.push(ch);
1857                self.advance();
1858            }
1859
1860            // Check if we've reached the end of the block scalar
1861            if line_indent < content_indent && !line.trim().is_empty() {
1862                break;
1863            }
1864
1865            let line_is_empty = line.trim().is_empty();
1866
1867            if line_is_empty {
1868                // Empty lines are preserved as-is
1869                if !first_line && !prev_was_empty {
1870                    content.push('\n');
1871                }
1872                prev_was_empty = true;
1873            } else {
1874                // Non-empty lines are folded (joined with spaces)
1875                if !first_line && !prev_was_empty {
1876                    content.push(' '); // Fold previous line break into space
1877                }
1878                content.push_str(line.trim());
1879                prev_was_empty = false;
1880            }
1881
1882            first_line = false;
1883
1884            if self.current_char.is_none() {
1885                break;
1886            }
1887        }
1888
1889        Ok(content)
1890    }
1891
1892    /// Check if the current position is the start of a mapping key by looking ahead for ':'
1893    fn check_for_mapping_ahead(&self) -> bool {
1894        // Look ahead through the current line for a ':' character
1895        for i in self.current_char_index..self.char_cache.len() {
1896            let ch = self.char_cache[i];
1897            match ch {
1898                ':' => {
1899                    // Found colon, check if it's followed by whitespace or end of line
1900                    let next_char = self.char_cache.get(i + 1).copied();
1901                    return next_char.map_or(true, |c| c.is_whitespace());
1902                }
1903                '\n' | '\r' => break, // End of line, no colon found
1904                _ => {}
1905            }
1906        }
1907        false
1908    }
1909
1910    /// Check if there's an active mapping at the specified indentation level
1911    /// This method properly handles BlockEnd tokens by tracking mapping start/end pairs
1912    fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
1913        let mut mapping_depth = 0;
1914        let _current_mapping_indent: Option<usize> = None;
1915
1916        // Walk backwards through tokens to find mapping context
1917        for token in self.tokens.iter().rev() {
1918            match &token.token_type {
1919                TokenType::BlockMappingStart => {
1920                    if mapping_depth == 0 {
1921                        // This is the most recent unmatched mapping start
1922                        // Check if it's at our target indentation level
1923                        // We approximate the indentation based on the indent stack when this token was created
1924                        return true; // Simplified: assume we found an active mapping
1925                    }
1926                    mapping_depth -= 1;
1927                }
1928                TokenType::BlockEnd => {
1929                    mapping_depth += 1;
1930                }
1931                TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
1932                    // Stop at document boundaries
1933                    break;
1934                }
1935                _ => {}
1936            }
1937        }
1938
1939        false
1940    }
1941}
1942
1943impl Scanner for BasicScanner {
1944    fn check_token(&self) -> bool {
1945        // For lazy scanning: check if we have cached tokens or can generate more
1946        self.token_index < self.tokens.len() || !self.done
1947    }
1948
1949    fn peek_token(&self) -> Result<Option<&Token>> {
1950        // This is a bit tricky with lazy scanning since peek shouldn't mutate
1951        // For now, return cached token if available
1952        Ok(self.tokens.get(self.token_index))
1953    }
1954
1955    fn get_token(&mut self) -> Result<Option<Token>> {
1956        // If we need more tokens and haven't finished, scan next token
1957        if self.token_index >= self.tokens.len() && !self.done {
1958            self.scan_next_token()?;
1959        }
1960
1961        if self.token_index < self.tokens.len() {
1962            let token = self.tokens[self.token_index].clone();
1963            self.token_index += 1;
1964            Ok(Some(token))
1965        } else {
1966            Ok(None)
1967        }
1968    }
1969
1970    fn reset(&mut self) {
1971        self.token_index = 0;
1972        self.position = Position::start();
1973        self.tokens.clear();
1974        self.done = false;
1975        self.current_char = self.input.chars().next();
1976        self.indent_stack = vec![0];
1977        self.current_indent = 0;
1978        self.flow_level = 0;
1979        self.detected_indent_style = None;
1980        self.indent_samples.clear();
1981        self.previous_indent_level = 0;
1982        self.current_char_index = 0;
1983        self.current_char = self.char_cache.first().copied();
1984    }
1985
1986    fn position(&self) -> Position {
1987        self.position
1988    }
1989
1990    fn input(&self) -> &str {
1991        &self.input
1992    }
1993}
1994
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input` and returns the value of the first token after
    /// `StreamStart`, panicking when that token is missing or not a scalar.
    fn first_scalar(input: &str) -> String {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // Skip StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => value,
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    }

    #[test]
    fn test_basic_tokenization() {
        let mut scanner = BasicScanner::new("42".to_string());

        assert!(scanner.check_token());

        // StreamStart
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamStart));

        // Number
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "42");
        } else {
            panic!("Expected scalar token");
        }

        // StreamEnd
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamEnd));
    }

    #[test]
    fn test_flow_sequence() {
        let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

        // StreamStart
        scanner.get_token().unwrap();

        // [
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

        // 1 (fail loudly instead of silently passing on a non-scalar token)
        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "1");
        } else {
            panic!("Expected scalar token");
        }

        // ,
        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowEntry));
    }

    #[test]
    fn test_quoted_strings() {
        assert_eq!(first_scalar(r#""hello world""#), "hello world");
    }

    #[test]
    fn test_comment_handling() {
        let input = r"
# Full line comment
key: value  # End of line comment
# Another comment
data: test
";
        let mut scanner = BasicScanner::new(input.to_string());

        let mut tokens = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            tokens.push(token);
        }

        // Should only contain YAML structure tokens, no comment tokens
        let scalar_values: Vec<String> = tokens
            .iter()
            .filter_map(|t| match &t.token_type {
                TokenType::Scalar(s, _) => Some(s.clone()),
                _ => None,
            })
            .collect();

        assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

        // Should not contain any comment tokens
        assert!(!tokens
            .iter()
            .any(|t| matches!(t.token_type, TokenType::Comment(_))));
    }

    #[test]
    fn test_hash_in_strings() {
        let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
        let mut scanner = BasicScanner::new(input.to_string());

        let mut scalar_values = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            if let TokenType::Scalar(value, _) = token.token_type {
                scalar_values.push(value);
            }
        }

        assert!(scalar_values.contains(&"This has a # character".to_string()));
        assert!(scalar_values.contains(&"Also has # character".to_string()));
        assert!(scalar_values.contains(&"value".to_string()));
        assert!(!scalar_values
            .iter()
            .any(|s| s.contains("This is a comment")));
    }

    #[test]
    fn test_escape_sequences() {
        // Test standard C-style escapes
        let test_cases = vec![
            (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
            (r#""Col1\tCol2""#, "Col1\tCol2"),
            (r#""First\rSecond""#, "First\rSecond"),
            (r#""Path\\to\\file""#, "Path\\to\\file"),
            (r#""He said \"Hello\"""#, "He said \"Hello\""),
            (r"'Don\'t do that'", "Don't do that"),
        ];

        for (input, expected) in test_cases {
            assert_eq!(first_scalar(input), expected, "Failed for input: {}", input);
        }
    }

    #[test]
    fn test_extended_yaml_escapes() {
        // Test additional YAML escape sequences
        let test_cases = vec![
            (r#""\0""#, "\0"),   // null character
            (r#""\a""#, "\x07"), // bell
            (r#""\b""#, "\x08"), // backspace
            (r#""\f""#, "\x0C"), // form feed
            (r#""\v""#, "\x0B"), // vertical tab
            (r#""\e""#, "\x1B"), // escape
            (r#""\ ""#, " "),    // literal space
            // The closing quote was missing here, so the case scanned an
            // unterminated string rather than the escape on its own.
            (r#""\/""#, "/"), // literal forward slash
        ];

        for (input, expected) in test_cases {
            assert_eq!(first_scalar(input), expected, "Failed for input: {}", input);
        }
    }

    #[test]
    fn test_unknown_escape_sequences() {
        // Test that unknown escape sequences are preserved literally
        let input = r#""\z\q\8""#;
        let expected = "\\z\\q\\8"; // Should preserve backslashes for unknown escapes

        assert_eq!(first_scalar(input), expected);
    }
}
rust_yaml/scanner/mod.rs

rust_yaml/scanner/
mod.rs