// yaml_edit/lex.rs
1//! Lexer for YAML files.
2
/// Whitespace and formatting validation errors
///
/// Produced by `lex_with_validation` / `lex_with_validation_config` alongside
/// the token stream. Lexing itself never fails; formatting problems are
/// reported out-of-band through this type instead of aborting tokenization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WhitespaceError {
    /// Human-readable description of the problem
    pub message: String,
    /// The byte range in the input where the error occurred
    pub range: std::ops::Range<usize>,
    /// Error category, useful for filtering or mapping to lint rules
    pub category: WhitespaceErrorCategory,
}
13
/// Categories of whitespace errors
///
/// Each variant corresponds to one check performed during lexing; see
/// `ValidationConfig` for which checks are configurable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WhitespaceErrorCategory {
    /// Tab character used for indentation (forbidden in YAML)
    TabIndentation,
    /// Line too long according to the configured limit
    LineTooLong,
    /// Mixed line ending styles (e.g., both `\n` and `\r\n` in one file)
    MixedLineEndings,
    /// Invalid scalar indentation
    InvalidIndentation,
}
26
/// YAML Concrete Syntax Tree (CST) node types.
///
/// This enum defines all possible node types in the YAML syntax tree, representing both
/// lexical tokens (from the lexer) and semantic nodes (created by the parser).
///
/// # Tree Hierarchy
///
/// The YAML syntax tree follows this general structure:
///
/// ```text
/// ROOT
/// ├── DOCUMENT*
/// │   ├── DIRECTIVE* (optional, e.g., %YAML 1.2)
/// │   ├── DOC_START? (optional ---)
/// │   ├── MAPPING | SEQUENCE | SCALAR | TAGGED_NODE
/// │   └── DOC_END? (optional ...)
/// └── WHITESPACE | NEWLINE | COMMENT (between documents)
///
/// MAPPING
/// ├── MAPPING_ENTRY*
/// │   ├── KEY
/// │   │   └── SCALAR | SEQUENCE | MAPPING (YAML 1.2 allows complex keys)
/// │   ├── COLON
/// │   ├── WHITESPACE?
/// │   └── VALUE
/// │       └── SCALAR | SEQUENCE | MAPPING | TAGGED_NODE
/// ├── NEWLINE
/// ├── INDENT
/// └── COMMENT?
///
/// SEQUENCE
/// ├── SEQUENCE_ENTRY*
/// │   ├── DASH
/// │   ├── WHITESPACE?
/// │   └── SCALAR | SEQUENCE | MAPPING | TAGGED_NODE
/// ├── NEWLINE
/// ├── INDENT
/// └── COMMENT?
///
/// SCALAR
/// └── STRING | INT | FLOAT | BOOL | NULL
///
/// TAGGED_NODE
/// ├── TAG (e.g., !!str, !custom)
/// ├── WHITESPACE?
/// └── SCALAR | MAPPING | SEQUENCE
/// ```
///
/// # Node Categories
///
/// ## Structural Nodes (created by parser)
/// - **ROOT**: Top-level container for the entire document
/// - **DOCUMENT**: A single YAML document (separated by --- or ...)
/// - **MAPPING**: Key-value pairs `{key: value}` or block style
/// - **SEQUENCE**: Lists `[item1, item2]` or block style with `-`
/// - **SCALAR**: Leaf values (strings, numbers, booleans, null)
/// - **TAGGED_NODE**: Values with explicit type tags `!!str "hello"`
///
/// ## Container Nodes (created by parser)
/// - **MAPPING_ENTRY**: A single key-value pair within a mapping
/// - **SEQUENCE_ENTRY**: A single item within a sequence
/// - **KEY**: The key part of a key-value pair (can contain complex types)
/// - **VALUE**: The value part of a key-value pair
///
/// ## Lexical Tokens (from lexer)
/// - **Punctuation**: COLON, DASH, COMMA, etc.
/// - **Brackets**: LEFT_BRACKET, RIGHT_BRACKET, LEFT_BRACE, RIGHT_BRACE
/// - **Literals**: STRING, INT, FLOAT, BOOL, NULL
/// - **YAML-specific**: TAG, ANCHOR, REFERENCE, MERGE_KEY
/// - **Document markers**: DOC_START (---), DOC_END (...)
/// - **Formatting**: WHITESPACE, NEWLINE, INDENT, COMMENT
///
/// ## Special Cases
///
/// ### Complex Keys (YAML 1.2.2)
/// Keys can be sequences or mappings, not just scalars:
/// ```yaml
/// [1, 2]: value        # Sequence key
/// {a: b}: value        # Mapping key
/// ```
///
/// ### Tagged Values
/// Values can have explicit type information:
/// ```yaml
/// number: !!int "123"  # Force string "123" to be treated as integer
/// binary: !!binary |   # Base64 encoded binary data
///   R0lGODlhDAAMAIQ...
/// ```
///
/// ### Block Scalars
/// Multi-line strings with special parsing rules:
/// ```yaml
/// literal: |           # PIPE indicates literal scalar
///   Line 1
///   Line 2
/// folded: >            # GREATER indicates folded scalar
///   Long text that
///   gets folded
/// ```
///
/// The tree preserves all original formatting, comments, and whitespace,
/// enabling lossless round-trip parsing and precise source location tracking.
///
/// NOTE: the `#[repr(u16)]` discriminants are handed to `rowan` verbatim via
/// `From<SyntaxKind> for rowan::SyntaxKind` (`kind as u16`), so reordering or
/// inserting variants changes the raw values seen by rowan consumers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u16)]
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
pub enum SyntaxKind {
    // Structural
    /// Root node of the syntax tree
    ROOT = 0,
    /// A YAML document
    DOCUMENT,
    /// A YAML sequence (list)
    SEQUENCE,
    /// A YAML mapping (key-value pairs)
    MAPPING,
    /// A YAML scalar value
    SCALAR,
    /// A YAML alias reference (e.g., *anchor_name)
    ALIAS,
    /// A YAML tagged scalar (tag + value)
    TAGGED_NODE,
    /// Parse error marker
    ERROR,

    // Tokens
    /// Dash character '-'
    DASH,
    /// Plus character '+'
    PLUS,
    /// Colon character ':'
    COLON,
    /// Question mark '?'
    QUESTION,
    /// Left bracket '['
    LEFT_BRACKET,
    /// Right bracket ']'
    RIGHT_BRACKET,
    /// Left brace '{'
    LEFT_BRACE,
    /// Right brace '}'
    RIGHT_BRACE,
    /// Comma ','
    COMMA,
    /// Pipe '|'
    PIPE,
    /// Greater than '>'
    GREATER,
    /// Ampersand '&'
    AMPERSAND,
    /// Asterisk '*'
    ASTERISK,
    /// Exclamation '!'
    EXCLAMATION,
    /// Percent '%'
    PERCENT,
    /// At symbol '@'
    AT,
    /// Backtick '`'
    BACKTICK,
    /// Double quote '"'
    QUOTE,
    /// Single quote "'"
    SINGLE_QUOTE,

    // Document markers
    /// Document start marker '---'
    DOC_START,
    /// Document end marker '...'
    DOC_END,

    // Parser-generated semantic nodes
    /// A mapping key (created by parser from context)
    KEY,
    /// A value in key-value pair (created by parser from context)
    VALUE,
    /// A complete mapping entry (key-value pair with associated tokens)
    MAPPING_ENTRY,
    /// A sequence entry (item with associated tokens)
    SEQUENCE_ENTRY,

    // Content tokens (from lexer)
    /// String literal (quoted or unquoted identifier)
    STRING,
    /// Unterminated string (missing closing quote)
    UNTERMINATED_STRING,
    /// Integer literal
    INT,
    /// Float literal
    FLOAT,
    /// Boolean literal (true/false)
    BOOL,
    /// Null literal
    NULL,
    /// YAML tag like '!tag'
    TAG,
    /// YAML anchor like '&anchor'
    ANCHOR,
    /// YAML reference like '*reference'
    REFERENCE,
    /// YAML merge key '<<'
    MERGE_KEY,
    /// YAML directive like '%YAML 1.2'
    DIRECTIVE,

    // Whitespace and formatting
    /// Spaces and tabs
    WHITESPACE,
    /// Newline characters
    NEWLINE,
    /// Leading whitespace that determines structure
    INDENT,
    /// Comments starting with '#'
    COMMENT,

    // Special
    /// UTF-8 Byte Order Mark (BOM) - U+FEFF at start of file
    BOM,
    /// End of file marker
    EOF,
}
247
impl From<SyntaxKind> for rowan::SyntaxKind {
    /// Convert into rowan's untyped kind by reusing the `#[repr(u16)]`
    /// discriminant directly — no lookup table, so the two stay in sync
    /// as long as `SyntaxKind`'s variant order is stable.
    fn from(kind: SyntaxKind) -> Self {
        Self(kind as u16)
    }
}
253
254/// Helper to read a scalar value starting from current position
255fn read_scalar_from<'a>(
256    chars: &mut std::iter::Peekable<std::str::CharIndices<'a>>,
257    input: &'a str,
258    start_idx: usize,
259    exclude_chars: &str,
260) -> &'a str {
261    let mut end_idx = start_idx;
262    while let Some((idx, ch)) = chars.peek() {
263        if ch.is_whitespace() || is_yaml_special_except(*ch, exclude_chars) {
264            break;
265        }
266        end_idx = *idx + ch.len_utf8();
267        chars.next();
268    }
269    &input[start_idx..end_idx]
270}
271
272/// Tokenize YAML input with whitespace validation
273pub fn lex(input: &str) -> Vec<(SyntaxKind, &str)> {
274    let (tokens, _) = lex_with_validation(input);
275    tokens
276}
277
/// Configuration for whitespace and formatting validation
///
/// Passed to `lex_with_validation_config`; `Default` gives a 120-column
/// limit with line-ending consistency enforced.
///
/// Derives `Debug`/`Clone`/`PartialEq`/`Eq` so callers can log, duplicate,
/// and compare configurations (the original lacked these, making the type
/// awkward to use in tests and diagnostics).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationConfig {
    /// Maximum line length (None = no limit); the lexer compares this
    /// against byte offsets within the line
    pub max_line_length: Option<usize>,
    /// Whether to enforce consistent line endings (report a mix of
    /// `\n`, `\r\n`, and `\r`)
    pub enforce_consistent_line_endings: bool,
}
285
impl Default for ValidationConfig {
    /// Defaults: 120-character line limit, line-ending consistency enforced.
    fn default() -> Self {
        Self {
            max_line_length: Some(120), // Default to 120 characters
            enforce_consistent_line_endings: true,
        }
    }
}
294
295/// Tokenize YAML input with whitespace and formatting validation
296pub fn lex_with_validation(input: &str) -> (Vec<(SyntaxKind, &str)>, Vec<WhitespaceError>) {
297    lex_with_validation_config(input, &ValidationConfig::default())
298}
299
300/// Tokenize YAML input with custom validation configuration
301pub fn lex_with_validation_config<'a>(
302    input: &'a str,
303    config: &ValidationConfig,
304) -> (Vec<(SyntaxKind, &'a str)>, Vec<WhitespaceError>) {
305    use SyntaxKind::*;
306
307    let mut tokens = Vec::with_capacity(input.len() / 8); // Pre-allocate based on estimate
308    let mut chars = input.char_indices().peekable();
309    let mut whitespace_errors = Vec::new();
310    let bytes = input.as_bytes();
311
312    // Track line information for validation
313    let mut current_line_start = 0;
314    let mut detected_line_ending: Option<&str> = None;
315
316    // Track flow collection depth for context-aware tokenization
317    let mut flow_depth: u32 = 0;
318
319    // Handle UTF-8 BOM (U+FEFF) at the start of the file
320    // Per YAML spec, BOM is allowed and should be processed transparently
321    if let Some((0, '\u{FEFF}')) = chars.peek() {
322        chars.next(); // Consume the BOM
323        tokens.push((BOM, "\u{FEFF}"));
324    }
325
326    while let Some((start_idx, ch)) = chars.next() {
327        let token_start = start_idx;
328
329        match ch {
330            // Context-aware hyphen handling
331            '-' => {
332                if let Some((_, '-')) = chars.peek() {
333                    chars.next(); // consume second -
334                    if let Some((_, '-')) = chars.peek() {
335                        chars.next(); // consume third -
336                        tokens.push((DOC_START, &input[token_start..start_idx + 3]));
337                    } else {
338                        // Just two dashes, treat as sequence marker followed by dash
339                        tokens.push((DASH, &input[token_start..start_idx + 1]));
340                        tokens.push((DASH, &input[start_idx + 1..start_idx + 2]));
341                    }
342                } else {
343                    // Check if this hyphen should be treated as a sequence marker
344                    // It's a sequence marker if:
345                    // 1. It's at the beginning of a line (after optional indentation)
346                    // 2. OR it follows a value context (after ? or : plus whitespace)
347                    // AND it's followed by whitespace or end of input
348
349                    // Check if preceded only by whitespace from start of line
350                    // Look for either \n or \r as line breaks
351                    let line_start_pos = input[..token_start]
352                        .rfind(['\n', '\r'])
353                        .map(|pos| pos + 1)
354                        .unwrap_or(0);
355                    let before_dash = &input[line_start_pos..token_start];
356                    let only_whitespace_before = before_dash.chars().all(|c| c == ' ' || c == '\t');
357
358                    // Check if the previous non-whitespace token was ? or :
359                    // indicating a value context where sequences are allowed
360                    let after_value_indicator = tokens
361                        .iter()
362                        .rev()
363                        .find(|(kind, _)| !matches!(kind, WHITESPACE | INDENT))
364                        .is_some_and(|(kind, _)| matches!(kind, QUESTION | COLON));
365
366                    // Check if followed by whitespace or end of input
367                    let followed_by_whitespace_or_end = chars
368                        .peek()
369                        .map_or(true, |(_, next_ch)| next_ch.is_whitespace());
370
371                    let is_sequence_marker = (only_whitespace_before || after_value_indicator)
372                        && followed_by_whitespace_or_end;
373
374                    if is_sequence_marker {
375                        tokens.push((DASH, &input[token_start..start_idx + 1]));
376                    } else {
377                        // This hyphen is part of a scalar value
378                        let text = read_scalar_from(&mut chars, input, start_idx + 1, "-");
379                        let full_text = &input[token_start..token_start + 1 + text.len()];
380                        let token_kind = classify_scalar(full_text);
381                        tokens.push((token_kind, full_text));
382                    }
383                }
384            }
385            '+' => tokens.push((PLUS, &input[token_start..start_idx + 1])),
386            ':' => {
387                // In flow collections, colon is always a structural character
388                // In block context, colon only indicates mapping if followed by whitespace
389                if flow_depth > 0 {
390                    // Inside flow collection: always tokenize as COLON
391                    tokens.push((COLON, &input[token_start..start_idx + 1]));
392                } else if let Some((_, next_ch)) = chars.peek() {
393                    if next_ch.is_whitespace() {
394                        // This is a mapping indicator in block context
395                        tokens.push((COLON, &input[token_start..start_idx + 1]));
396                    } else {
397                        // This colon is part of a plain scalar (e.g., URLs, timestamps)
398                        // Continue reading the scalar
399                        let mut end_idx = start_idx + 1;
400                        while let Some((idx, next_ch)) = chars.peek() {
401                            if next_ch.is_whitespace() {
402                                break;
403                            }
404                            // Check for special chars, but exclude colon since we're already in a scalar with colon
405                            if is_yaml_special_except(*next_ch, ":") {
406                                break;
407                            }
408                            end_idx = *idx + next_ch.len_utf8();
409                            chars.next();
410                        }
411                        let text = &input[token_start..end_idx];
412                        tokens.push((classify_scalar(text), text));
413                    }
414                } else {
415                    // Colon at end of input
416                    tokens.push((COLON, &input[token_start..start_idx + 1]));
417                }
418            }
419            '?' => tokens.push((QUESTION, &input[token_start..start_idx + 1])),
420            '[' => {
421                flow_depth += 1;
422                tokens.push((LEFT_BRACKET, &input[token_start..start_idx + 1]));
423            }
424            ']' => {
425                flow_depth = flow_depth.saturating_sub(1);
426                tokens.push((RIGHT_BRACKET, &input[token_start..start_idx + 1]));
427            }
428            '{' => {
429                flow_depth += 1;
430                tokens.push((LEFT_BRACE, &input[token_start..start_idx + 1]));
431            }
432            '}' => {
433                flow_depth = flow_depth.saturating_sub(1);
434                tokens.push((RIGHT_BRACE, &input[token_start..start_idx + 1]));
435            }
436            ',' => tokens.push((COMMA, &input[token_start..start_idx + 1])),
437            '|' => tokens.push((PIPE, &input[token_start..start_idx + 1])),
438            '>' => tokens.push((GREATER, &input[token_start..start_idx + 1])),
439            '<' => {
440                // Check if this is a merge key '<<'
441                if let Some((_, '<')) = chars.peek() {
442                    chars.next(); // consume second <
443                    tokens.push((MERGE_KEY, &input[token_start..start_idx + 2]));
444                } else {
445                    // Single '<' is not a special YAML character, treat as scalar
446                    let mut end_idx = start_idx + 1;
447                    while let Some((idx, ch)) = chars.peek() {
448                        if ch.is_whitespace() || is_yaml_special(*ch) {
449                            break;
450                        }
451                        end_idx = *idx + ch.len_utf8();
452                        chars.next();
453                    }
454                    let text = &input[token_start..end_idx];
455                    let token_kind = classify_scalar(text);
456                    tokens.push((token_kind, text));
457                }
458            }
459            '&' => {
460                // Check if this is an anchor definition
461                let name = read_scalar_from(&mut chars, input, start_idx + 1, "");
462                if !name.is_empty() {
463                    tokens.push((ANCHOR, &input[token_start..start_idx + 1 + name.len()]));
464                } else {
465                    tokens.push((AMPERSAND, &input[token_start..start_idx + 1]));
466                }
467            }
468            '*' => {
469                // Check if this is an alias reference
470                let name = read_scalar_from(&mut chars, input, start_idx + 1, "");
471                if !name.is_empty() {
472                    tokens.push((REFERENCE, &input[token_start..start_idx + 1 + name.len()]));
473                } else {
474                    tokens.push((ASTERISK, &input[token_start..start_idx + 1]));
475                }
476            }
477            '"' => {
478                // Read entire double-quoted string
479                let mut end_idx = start_idx + 1;
480                let mut escaped = false;
481                let mut found_closing = false;
482
483                while let Some((idx, ch)) = chars.peek() {
484                    let current_idx = *idx;
485                    let current_ch = *ch;
486
487                    if escaped {
488                        escaped = false;
489                        end_idx = current_idx + current_ch.len_utf8();
490                        chars.next();
491                        continue;
492                    }
493
494                    if current_ch == '\\' {
495                        escaped = true;
496                        end_idx = current_idx + current_ch.len_utf8();
497                        chars.next();
498                    } else if current_ch == '"' {
499                        end_idx = current_idx + current_ch.len_utf8();
500                        chars.next();
501                        found_closing = true;
502                        break;
503                    } else {
504                        end_idx = current_idx + current_ch.len_utf8();
505                        chars.next();
506                    }
507                }
508
509                if found_closing {
510                    tokens.push((STRING, &input[token_start..end_idx]));
511                } else {
512                    // Unterminated string - add UNTERMINATED_STRING token
513                    tokens.push((UNTERMINATED_STRING, &input[token_start..end_idx]));
514                }
515            }
516            '\'' => {
517                // Read entire single-quoted string
518                let mut end_idx = start_idx + 1;
519                let mut found_closing = false;
520
521                while let Some((idx, ch)) = chars.peek() {
522                    let current_idx = *idx;
523                    let current_ch = *ch;
524
525                    if current_ch == '\'' {
526                        // Check for escaped quote ('')
527                        end_idx = current_idx + current_ch.len_utf8();
528                        chars.next();
529                        if let Some((next_idx, '\'')) = chars.peek() {
530                            // Double quote - consume both and continue
531                            end_idx = *next_idx + 1;
532                            chars.next();
533                        } else {
534                            // Single quote - end of string
535                            found_closing = true;
536                            break;
537                        }
538                    } else {
539                        end_idx = current_idx + current_ch.len_utf8();
540                        chars.next();
541                    }
542                }
543
544                if found_closing {
545                    tokens.push((STRING, &input[token_start..end_idx]));
546                } else {
547                    // Unterminated string - add UNTERMINATED_STRING token
548                    tokens.push((UNTERMINATED_STRING, &input[token_start..end_idx]));
549                }
550            }
551
552            // Document end
553            '.' => {
554                // Check for three dots (document end marker)
555                if chars.peek() == Some(&(start_idx + 1, '.')) {
556                    chars.next(); // consume second .
557                    if chars.peek() == Some(&(start_idx + 2, '.')) {
558                        chars.next(); // consume third .
559                        tokens.push((DOC_END, &input[token_start..start_idx + 3]));
560                    } else {
561                        // Two dots - continue as scalar
562                        let rest = read_scalar_from(&mut chars, input, start_idx + 2, "");
563                        let text = &input[token_start..start_idx + 2 + rest.len()];
564                        let token_kind = classify_scalar(text);
565                        tokens.push((token_kind, text));
566                    }
567                } else {
568                    // Single dot - part of scalar
569                    let rest = read_scalar_from(&mut chars, input, start_idx + 1, "");
570                    let text = &input[token_start..start_idx + 1 + rest.len()];
571                    let token_kind = classify_scalar(text);
572                    tokens.push((token_kind, text));
573                }
574            }
575
576            // Comments
577            '#' => {
578                let mut end_idx = start_idx + 1;
579                while let Some((idx, ch)) = chars.peek() {
580                    if *ch == '\n' || *ch == '\r' {
581                        break;
582                    }
583                    end_idx = *idx + ch.len_utf8();
584                    chars.next();
585                }
586                tokens.push((COMMENT, &input[token_start..end_idx]));
587            }
588
589            // Tags
590            '!' => {
591                // Handle tag indicators - both ! and !!
592                let mut end_idx = start_idx + 1;
593
594                // Check for double exclamation (global tag)
595                if let Some((_, '!')) = chars.peek() {
596                    chars.next(); // consume the second !
597                    end_idx = start_idx + 2;
598                }
599
600                // Read the tag name after the ! or !!
601                while let Some((idx, ch)) = chars.peek() {
602                    if ch.is_whitespace() || is_yaml_special(*ch) {
603                        break;
604                    }
605                    end_idx = *idx + ch.len_utf8();
606                    chars.next();
607                }
608
609                tokens.push((TAG, &input[token_start..end_idx]));
610            }
611
612            '%' => {
613                // In flow collections, % is part of plain scalars, not a directive
614                if flow_depth > 0 {
615                    // Treat as part of a plain scalar in flow context
616                    let mut end_idx = start_idx + 1;
617                    while let Some((idx, next_ch)) = chars.peek() {
618                        if next_ch.is_whitespace() {
619                            break;
620                        }
621                        if is_yaml_special_except(*next_ch, "%") {
622                            break;
623                        }
624                        end_idx = *idx + next_ch.len_utf8();
625                        chars.next();
626                    }
627                    let text = &input[token_start..end_idx];
628                    tokens.push((classify_scalar(text), text));
629                } else {
630                    // In block context, % starts a directive
631                    let mut end_idx = start_idx + 1;
632                    while let Some((idx, ch)) = chars.peek() {
633                        if *ch == '\n' || *ch == '\r' {
634                            break;
635                        }
636                        end_idx = *idx + ch.len_utf8();
637                        chars.next();
638                    }
639                    tokens.push((DIRECTIVE, &input[token_start..end_idx]));
640                }
641            }
642
643            // Newlines
644            '\n' => {
645                // Check line length before processing newline
646                if let Some(max_len) = config.max_line_length {
647                    let line_length = start_idx - current_line_start;
648                    if line_length > max_len {
649                        whitespace_errors.push(WhitespaceError {
650                            message: format!(
651                                "Line too long ({} > {} characters)",
652                                line_length, max_len
653                            ),
654                            range: current_line_start..start_idx,
655                            category: WhitespaceErrorCategory::LineTooLong,
656                        });
657                    }
658                }
659
660                // Validate line ending consistency
661                let line_ending = "\n";
662                if config.enforce_consistent_line_endings {
663                    if let Some(detected) = detected_line_ending {
664                        if detected != line_ending {
665                            whitespace_errors.push(WhitespaceError {
666                                message: "Inconsistent line endings detected".to_string(),
667                                range: token_start..start_idx + 1,
668                                category: WhitespaceErrorCategory::MixedLineEndings,
669                            });
670                        }
671                    } else {
672                        detected_line_ending = Some(line_ending);
673                    }
674                }
675
676                tokens.push((NEWLINE, &input[token_start..start_idx + 1]));
677                current_line_start = start_idx + 1;
678            }
679            '\r' => {
680                // Check line length before processing newline
681                if let Some(max_len) = config.max_line_length {
682                    let line_length = start_idx - current_line_start;
683                    if line_length > max_len {
684                        whitespace_errors.push(WhitespaceError {
685                            message: format!(
686                                "Line too long ({} > {} characters)",
687                                line_length, max_len
688                            ),
689                            range: current_line_start..start_idx,
690                            category: WhitespaceErrorCategory::LineTooLong,
691                        });
692                    }
693                }
694
695                let (line_ending, end_pos) = if let Some((_, '\n')) = chars.peek() {
696                    chars.next();
697                    ("\r\n", start_idx + 2)
698                } else {
699                    ("\r", start_idx + 1)
700                };
701
702                // Validate line ending consistency
703                if config.enforce_consistent_line_endings {
704                    if let Some(detected) = detected_line_ending {
705                        if detected != line_ending {
706                            whitespace_errors.push(WhitespaceError {
707                                message: "Inconsistent line endings detected".to_string(),
708                                range: token_start..end_pos,
709                                category: WhitespaceErrorCategory::MixedLineEndings,
710                            });
711                        }
712                    } else {
713                        detected_line_ending = Some(line_ending);
714                    }
715                }
716
717                tokens.push((NEWLINE, &input[token_start..end_pos]));
718                current_line_start = end_pos;
719            }
720
721            // Whitespace (spaces and tabs)
722            ' ' | '\t' => {
723                let mut end_idx = start_idx + 1;
724                let mut has_tabs = ch == '\t';
725
726                while let Some((idx, ch)) = chars.peek() {
727                    if *ch != ' ' && *ch != '\t' {
728                        break;
729                    }
730                    if *ch == '\t' {
731                        has_tabs = true;
732                    }
733                    end_idx = *idx + 1;
734                    chars.next();
735                }
736
737                // Determine if this is structural indentation
738                // Check for any line break: \n, \r\n (already consumed \n), or \r alone
739                let is_indentation = token_start == 0
740                    || (token_start > 0
741                        && (bytes[token_start - 1] == b'\n' || bytes[token_start - 1] == b'\r'));
742
743                if is_indentation {
744                    // Check for tab characters in indentation (forbidden in YAML)
745                    if has_tabs {
746                        whitespace_errors.push(WhitespaceError {
747                            message: "Tab character used for indentation (forbidden in YAML)"
748                                .to_string(),
749                            range: token_start..end_idx,
750                            category: WhitespaceErrorCategory::TabIndentation,
751                        });
752                    }
753                    tokens.push((INDENT, &input[token_start..end_idx]));
754                } else {
755                    tokens.push((WHITESPACE, &input[token_start..end_idx]));
756                }
757            }
758
759            // Everything else is scalar content
760            _ => {
761                let mut end_idx = start_idx + ch.len_utf8();
762
763                // Read the rest of the scalar normally, including embedded hyphens
764                while let Some((idx, next_ch)) = chars.peek() {
765                    if next_ch.is_whitespace() {
766                        break;
767                    }
768
769                    // Check for YAML special characters
770                    // Special handling for colon: only special if followed by whitespace or EOF
771                    if *next_ch == ':' {
772                        // Peek ahead one more to check if colon is followed by whitespace
773                        let next_idx = *idx + next_ch.len_utf8();
774                        if next_idx >= input.len() {
775                            // Colon at EOF - stop here (treat as mapping indicator)
776                            break;
777                        } else if let Some(after) = input[next_idx..].chars().next() {
778                            if after.is_whitespace() {
779                                // Colon followed by whitespace - stop here
780                                break;
781                            }
782                        }
783                        // Colon not followed by whitespace - continue as part of scalar
784                        end_idx = *idx + next_ch.len_utf8();
785                        chars.next();
786                        continue;
787                    }
788
789                    // Check other special characters (excluding hyphen and colon)
790                    if is_yaml_special_except(*next_ch, "-:") {
791                        // In block context, flow indicators do NOT break scalars
792                        if flow_depth == 0 && matches!(*next_ch, '[' | ']' | '{' | '}' | ',') {
793                            // do nothing, let it be part of the scalar
794                        } else {
795                            break;
796                        }
797                    }
798
799                    // Special case: check if hyphen is a sequence marker
800                    if *next_ch == '-' {
801                        // A hyphen is only a sequence marker if it's at line start
802                        // and this scalar is already complete (we're at a word boundary)
803                        let line_start = input[..(*idx)].rfind('\n').map(|p| p + 1).unwrap_or(0);
804                        let before_hyphen = &input[line_start..*idx];
805
806                        // If there's only whitespace before the hyphen, it might be a sequence marker
807                        // Break here to let the main loop handle it
808                        if before_hyphen.chars().all(|c| c == ' ' || c == '\t') && *idx == end_idx {
809                            break;
810                        }
811                    }
812
813                    end_idx = *idx + next_ch.len_utf8();
814                    chars.next();
815                }
816
817                let text = &input[token_start..end_idx];
818                tokens.push((classify_scalar(text), text));
819            }
820        }
821    }
822
823    // Check the final line length if there's no trailing newline
824    if let Some(max_len) = config.max_line_length {
825        let final_line_length = input.len() - current_line_start;
826        if final_line_length > max_len && final_line_length > 0 {
827            whitespace_errors.push(WhitespaceError {
828                message: format!(
829                    "Line too long ({} > {} characters)",
830                    final_line_length, max_len
831                ),
832                range: current_line_start..input.len(),
833                category: WhitespaceErrorCategory::LineTooLong,
834            });
835        }
836    }
837
838    (tokens, whitespace_errors)
839}
840
841/// Classify a scalar token based on its content
842fn classify_scalar(text: &str) -> SyntaxKind {
843    use SyntaxKind::*;
844
845    // Boolean literals
846    match text {
847        "true" | "false" | "True" | "False" | "TRUE" | "FALSE" => return BOOL,
848        "null" | "Null" | "NULL" | "~" => return NULL,
849        _ => {}
850    }
851
852    // Try to parse as integer (handles 0x, 0o, 0b, octal, decimal)
853    if crate::scalar::ScalarValue::parse_integer(text).is_some() {
854        return INT;
855    }
856
857    // YAML special float values (infinity and NaN)
858    // Note: Must check these before general f64 parsing because Rust's parse::<f64>()
859    // accepts "infinity" and "inf" which should only be treated as floats in YAML
860    // when written as ".inf", not as bare "infinity" or "inf"
861    match text {
862        ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" | "-.inf" | "-.Inf" | "-.INF"
863        | ".nan" | ".NaN" | ".NAN" => return FLOAT,
864        // Rust's parse::<f64>() accepts "infinity" and "inf", but in YAML these
865        // should be treated as strings unless written as ".inf"
866        "infinity" | "inf" | "Infinity" | "Inf" | "INFINITY" | "INF" | "-infinity" | "-inf"
867        | "-Infinity" | "-Inf" | "-INFINITY" | "-INF" | "+infinity" | "+inf" | "+Infinity"
868        | "+Inf" | "+INFINITY" | "+INF" | "nan" | "NaN" | "NAN" => return STRING,
869        _ => {}
870    }
871
872    // Try to parse as float
873    if text.parse::<f64>().is_ok() {
874        return FLOAT;
875    }
876
877    // Everything else is a string
878    STRING
879}
880
/// Characters that can carry special meaning in YAML syntax.
const YAML_SPECIAL_CHARS: &str = ":+-?[]{},'|>&*!%\"#";

/// Returns `true` if `ch` is one of the YAML special characters.
fn is_yaml_special(ch: char) -> bool {
    YAML_SPECIAL_CHARS.chars().any(|c| c == ch)
}

/// Like [`is_yaml_special`], but characters listed in `exclude` are not
/// considered special.
fn is_yaml_special_except(ch: char, exclude: &str) -> bool {
    !exclude.contains(ch) && YAML_SPECIAL_CHARS.contains(ch)
}
893
894#[cfg(test)]
895mod tests {
896    use super::*;
897
898    #[test]
899    fn test_simple_mapping() {
900        let input = "key: value";
901        let tokens = lex(input);
902
903        assert_eq!(tokens.len(), 4);
904        assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
905        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
906        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
907        assert_eq!(tokens[3], (SyntaxKind::STRING, "value"));
908    }
909
910    #[test]
911    fn test_scalar_types() {
912        // Test integer
913        let tokens = lex("age: 42");
914        assert_eq!(tokens[0], (SyntaxKind::STRING, "age"));
915        assert_eq!(tokens[3], (SyntaxKind::INT, "42"));
916
917        // Test float
918        let tokens = lex("pi: 3.14");
919        assert_eq!(tokens[0], (SyntaxKind::STRING, "pi"));
920        assert_eq!(tokens[3], (SyntaxKind::FLOAT, "3.14"));
921
922        // Test boolean true
923        let tokens = lex("enabled: true");
924        assert_eq!(tokens[0], (SyntaxKind::STRING, "enabled"));
925        assert_eq!(tokens[3], (SyntaxKind::BOOL, "true"));
926
927        // Test boolean false
928        let tokens = lex("disabled: false");
929        assert_eq!(tokens[3], (SyntaxKind::BOOL, "false"));
930
931        // Test null
932        let tokens = lex("value: null");
933        assert_eq!(tokens[3], (SyntaxKind::NULL, "null"));
934
935        // Test tilde as null
936        let tokens = lex("value: ~");
937        assert_eq!(tokens[3], (SyntaxKind::NULL, "~"));
938    }
939
940    #[test]
941    fn test_sequences() {
942        let input = "- item1\n- item2";
943        let tokens = lex(input);
944
945        assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
946        assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
947        assert_eq!(tokens[2], (SyntaxKind::STRING, "item1"));
948        assert_eq!(tokens[3], (SyntaxKind::NEWLINE, "\n"));
949        assert_eq!(tokens[4], (SyntaxKind::DASH, "-"));
950        assert_eq!(tokens[5], (SyntaxKind::WHITESPACE, " "));
951        assert_eq!(tokens[6], (SyntaxKind::STRING, "item2"));
952    }
953
954    #[test]
955    fn test_hyphen_in_scalars() {
956        // Test hyphens in scalar values should not be treated as sequence markers
957        let input = "Name: example-project";
958        let tokens = lex(input);
959
960        println!("Hyphen test tokens:");
961        for (i, (kind, text)) in tokens.iter().enumerate() {
962            println!("  {}: {:?} = {:?}", i, kind, text);
963        }
964
965        // Should get: STRING("Name"), COLON(":"), WHITESPACE(" "), STRING("example-project")
966        assert_eq!(tokens.len(), 4);
967        assert_eq!(tokens[0], (SyntaxKind::STRING, "Name"));
968        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
969        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
970        assert_eq!(tokens[3], (SyntaxKind::STRING, "example-project"));
971    }
972
973    #[test]
974    fn test_hyphen_sequence_vs_scalar() {
975        // Test that sequence markers are still recognized correctly
976        let sequence_input = "- example-item";
977        let tokens = lex(sequence_input);
978
979        println!("Sequence hyphen tokens:");
980        for (i, (kind, text)) in tokens.iter().enumerate() {
981            println!("  {}: {:?} = {:?}", i, kind, text);
982        }
983
984        // Should get: DASH("-"), WHITESPACE(" "), STRING("example-item")
985        assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
986        assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
987        assert_eq!(tokens[2], (SyntaxKind::STRING, "example-item"));
988
989        // Test scalar with hyphens in different contexts
990        let scalar_input = "package-name: my-awesome-package";
991        let tokens = lex(scalar_input);
992
993        println!("Package hyphen tokens:");
994        for (i, (kind, text)) in tokens.iter().enumerate() {
995            println!("  {}: {:?} = {:?}", i, kind, text);
996        }
997
998        // Should get: STRING("package-name"), COLON(":"), WHITESPACE(" "), STRING("my-awesome-package")
999        assert_eq!(tokens.len(), 4);
1000        assert_eq!(tokens[0], (SyntaxKind::STRING, "package-name"));
1001        assert_eq!(tokens[3], (SyntaxKind::STRING, "my-awesome-package"));
1002    }
1003
1004    #[test]
1005    fn test_flow_style() {
1006        // Flow sequence
1007        let tokens = lex("[1, 2, 3]");
1008        assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACKET, "["));
1009        assert_eq!(tokens[1], (SyntaxKind::INT, "1"));
1010        assert_eq!(tokens[2], (SyntaxKind::COMMA, ","));
1011        assert_eq!(tokens[3], (SyntaxKind::WHITESPACE, " "));
1012        assert_eq!(tokens[4], (SyntaxKind::INT, "2"));
1013        assert_eq!(tokens[5], (SyntaxKind::COMMA, ","));
1014        assert_eq!(tokens[6], (SyntaxKind::WHITESPACE, " "));
1015        assert_eq!(tokens[7], (SyntaxKind::INT, "3"));
1016        assert_eq!(tokens[8], (SyntaxKind::RIGHT_BRACKET, "]"));
1017
1018        // Flow mapping
1019        let tokens = lex("{a: 1, b: 2}");
1020        assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACE, "{"));
1021        assert_eq!(tokens[1], (SyntaxKind::STRING, "a"));
1022        assert_eq!(tokens[2], (SyntaxKind::COLON, ":"));
1023        assert_eq!(tokens[3], (SyntaxKind::WHITESPACE, " "));
1024        assert_eq!(tokens[4], (SyntaxKind::INT, "1"));
1025    }
1026
1027    #[test]
1028    fn test_comments() {
1029        let input = "key: value # this is a comment\n# full line comment";
1030        let tokens = lex(input);
1031
1032        // Find comment tokens
1033        let comments: Vec<_> = tokens
1034            .iter()
1035            .filter(|(kind, _)| *kind == SyntaxKind::COMMENT)
1036            .collect();
1037
1038        assert_eq!(comments.len(), 2);
1039        assert_eq!(comments[0].1, "# this is a comment");
1040        assert_eq!(comments[1].1, "# full line comment");
1041    }
1042
1043    #[test]
1044    fn test_multiline_scalar() {
1045        let input = "key: value\n  continued";
1046        let tokens = lex(input);
1047
1048        // Check for indent token
1049        let indents: Vec<_> = tokens
1050            .iter()
1051            .filter(|(kind, _)| *kind == SyntaxKind::INDENT)
1052            .collect();
1053        assert_eq!(indents.len(), 1);
1054        assert_eq!(indents[0].1, "  ");
1055    }
1056
1057    #[test]
1058    fn test_quoted_strings() {
1059        let input = r#"single: 'quoted'
1060double: "quoted""#;
1061        let tokens = lex(input);
1062
1063        // Find quoted string tokens - after fix, quotes are included in STRING tokens
1064        let quoted_strings: Vec<_> = tokens
1065            .iter()
1066            .filter(|(kind, text)| {
1067                *kind == SyntaxKind::STRING && (text.starts_with('\'') || text.starts_with('"'))
1068            })
1069            .collect();
1070        assert_eq!(quoted_strings.len(), 2); // single and double quoted strings
1071
1072        // Verify content (order depends on which appears first in the source)
1073        let quoted_texts: Vec<&str> = {
1074            let mut v: Vec<&str> = quoted_strings.iter().map(|(_, t)| *t).collect();
1075            v.sort();
1076            v
1077        };
1078        assert_eq!(quoted_texts, ["\"quoted\"", "'quoted'"]);
1079    }
1080
1081    #[test]
1082    fn test_document_markers() {
1083        let input = "---\nkey: value\n...";
1084        let tokens = lex(input);
1085
1086        println!("Document tokens:");
1087        for (i, (kind, text)) in tokens.iter().enumerate() {
1088            println!("  {}: {:?} = {:?}", i, kind, text);
1089        }
1090
1091        // Check for document start and end markers
1092        let doc_start_count = tokens
1093            .iter()
1094            .filter(|(kind, _)| *kind == SyntaxKind::DOC_START)
1095            .count();
1096        let doc_end_count = tokens
1097            .iter()
1098            .filter(|(kind, _)| *kind == SyntaxKind::DOC_END)
1099            .count();
1100        assert_eq!(doc_start_count, 1);
1101        assert_eq!(doc_end_count, 1);
1102    }
1103
1104    #[test]
1105    fn test_empty_input() {
1106        let input = "";
1107        let tokens = lex(input);
1108        println!("Empty input tokens: {:?}", tokens);
1109        assert_eq!(tokens.len(), 0);
1110    }
1111
1112    #[test]
1113    fn test_anchors_and_aliases() {
1114        // Test anchor definition
1115        let input = "key: &anchor_name value";
1116        let tokens = lex(input);
1117        println!("Anchor tokens: {:?}", tokens);
1118
1119        let anchors: Vec<_> = tokens
1120            .iter()
1121            .filter(|(kind, _)| *kind == SyntaxKind::ANCHOR)
1122            .collect();
1123        assert_eq!(anchors.len(), 1);
1124        assert_eq!(anchors[0].1, "&anchor_name");
1125
1126        // Test alias reference
1127        let input = "key: *reference_name";
1128        let tokens = lex(input);
1129        println!("Reference tokens: {:?}", tokens);
1130
1131        let references: Vec<_> = tokens
1132            .iter()
1133            .filter(|(kind, _)| *kind == SyntaxKind::REFERENCE)
1134            .collect();
1135        assert_eq!(references.len(), 1);
1136        assert_eq!(references[0].1, "*reference_name");
1137
1138        // Test bare ampersand and asterisk (should not be treated as anchors/references)
1139        let input = "key: & *";
1140        let tokens = lex(input);
1141
1142        let ampersands: Vec<_> = tokens
1143            .iter()
1144            .filter(|(kind, _)| *kind == SyntaxKind::AMPERSAND)
1145            .collect();
1146        assert_eq!(ampersands.len(), 1);
1147
1148        let asterisks: Vec<_> = tokens
1149            .iter()
1150            .filter(|(kind, _)| *kind == SyntaxKind::ASTERISK)
1151            .collect();
1152        assert_eq!(asterisks.len(), 1);
1153    }
1154
1155    #[test]
1156    fn test_merge_key_token() {
1157        // Test merge key '<<'
1158        let input = "<<: *defaults";
1159        let tokens = lex(input);
1160
1161        let merge_keys: Vec<_> = tokens
1162            .iter()
1163            .filter(|(kind, _)| *kind == SyntaxKind::MERGE_KEY)
1164            .collect();
1165        assert_eq!(merge_keys.len(), 1);
1166        assert_eq!(merge_keys[0].1, "<<");
1167
1168        // Test single '<' is not a merge key
1169        let input2 = "key: < value";
1170        let tokens2 = lex(input2);
1171
1172        let merge_keys2: Vec<_> = tokens2
1173            .iter()
1174            .filter(|(kind, _)| *kind == SyntaxKind::MERGE_KEY)
1175            .collect();
1176        assert_eq!(merge_keys2.len(), 0, "Single < should not be a merge key");
1177    }
1178
1179    #[test]
1180    fn test_plus_token() {
1181        // Test plus as standalone token
1182        let input = "key: |+ value";
1183        let tokens = lex(input);
1184
1185        let plus_tokens: Vec<_> = tokens
1186            .iter()
1187            .filter(|(kind, _)| *kind == SyntaxKind::PLUS)
1188            .collect();
1189        assert_eq!(plus_tokens.len(), 1);
1190        assert_eq!(plus_tokens[0].1, "+");
1191    }
1192
1193    #[test]
1194    fn test_block_scalar_indicators() {
1195        // Test literal with chomping indicators
1196        let input1 = "key: |+ content";
1197        let tokens1 = lex(input1);
1198
1199        assert!(tokens1
1200            .iter()
1201            .any(|(kind, text)| *kind == SyntaxKind::PIPE && *text == "|"));
1202        assert!(tokens1
1203            .iter()
1204            .any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
1205
1206        // Test folded with chomping indicators
1207        let input2 = "key: >- content";
1208        let tokens2 = lex(input2);
1209
1210        assert!(tokens2
1211            .iter()
1212            .any(|(kind, text)| *kind == SyntaxKind::GREATER && *text == ">"));
1213        assert!(tokens2
1214            .iter()
1215            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-"));
1216
1217        // Test with explicit indentation
1218        let input3 = "key: |2+ content";
1219        let tokens3 = lex(input3);
1220
1221        assert!(tokens3
1222            .iter()
1223            .any(|(kind, text)| *kind == SyntaxKind::PIPE && *text == "|"));
1224        assert!(tokens3
1225            .iter()
1226            .any(|(kind, text)| *kind == SyntaxKind::INT && *text == "2"));
1227        assert!(tokens3
1228            .iter()
1229            .any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
1230    }
1231
1232    #[test]
1233    fn test_special_characters_in_block_content() {
1234        let input = "line with - and + and : characters";
1235        let tokens = lex(input);
1236
1237        // With context-aware hyphen parsing, the standalone hyphen with spaces
1238        // is treated as a string because it's not a sequence marker
1239        assert!(tokens
1240            .iter()
1241            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-"));
1242
1243        // Plus and colon are still tokenized as special characters
1244        assert!(tokens
1245            .iter()
1246            .any(|(kind, text)| *kind == SyntaxKind::PLUS && *text == "+"));
1247        assert!(tokens
1248            .iter()
1249            .any(|(kind, text)| *kind == SyntaxKind::COLON && *text == ":"));
1250
1251        // Should also have the word tokens
1252        assert!(tokens
1253            .iter()
1254            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "line"));
1255        assert!(tokens
1256            .iter()
1257            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "with"));
1258        assert!(tokens
1259            .iter()
1260            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "and"));
1261        assert!(tokens
1262            .iter()
1263            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "characters"));
1264    }
1265
1266    #[test]
1267    fn test_token_recognition() {
1268        let input = "key: |2+ \n  content with - and : and > chars\n  more content";
1269        let tokens = lex(input);
1270
1271        // Print tokens for debugging
1272        println!("Comprehensive tokens:");
1273        for (i, (kind, text)) in tokens.iter().enumerate() {
1274            println!("  {}: {:?} = {:?}", i, kind, text);
1275        }
1276
1277        // Verify all expected token kinds are present (input: "key: |2+ \n  content with - and : and > chars\n  more content")
1278        let count = |k: SyntaxKind| tokens.iter().filter(|(kind, _)| *kind == k).count();
1279        // Two colons: one for the mapping ("key:"), one in the value content ("and :")
1280        assert_eq!(count(SyntaxKind::COLON), 2);
1281        assert_eq!(count(SyntaxKind::PIPE), 1); // "|"
1282        assert_eq!(count(SyntaxKind::INT), 1); // "2"
1283        assert_eq!(count(SyntaxKind::PLUS), 1); // "+"
1284        assert_eq!(count(SyntaxKind::GREATER), 1); // ">"
1285        assert_eq!(count(SyntaxKind::NEWLINE), 2); // after "|2+" line and after first content line
1286        assert_eq!(count(SyntaxKind::INDENT), 2); // "  " before each content line
1287                                                  // Multiple STRING tokens: "key", content words, and the hyphen
1288        assert!(count(SyntaxKind::STRING) >= 1, "expected STRING tokens");
1289
1290        // With context-aware hyphen parsing, the hyphen in content is now part of a STRING
1291        assert_eq!(
1292            tokens
1293                .iter()
1294                .filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
1295                .count(),
1296            1
1297        );
1298    }
1299
1300    #[test]
1301    fn test_dash_handling() {
1302        // Test 1: Document start marker
1303        let input = "---\nkey: value";
1304        let tokens = lex(input);
1305        assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
1306
1307        // Test 2: Document with just three dashes
1308        let input = "---";
1309        let tokens = lex(input);
1310        assert_eq!(tokens.len(), 1);
1311        assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
1312
1313        // Test 3: Two dashes (not a document marker)
1314        let input = "--";
1315        let tokens = lex(input);
1316        assert_eq!(tokens.len(), 2);
1317        assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
1318        assert_eq!(tokens[1], (SyntaxKind::DASH, "-"));
1319
1320        // Test 4: Four dashes
1321        let input = "----";
1322        let tokens = lex(input);
1323        assert_eq!(tokens[0], (SyntaxKind::DOC_START, "---"));
1324        assert_eq!(tokens[1], (SyntaxKind::STRING, "-"));
1325    }
1326
1327    #[test]
1328    fn test_dash_in_different_scalar_contexts() {
1329        // Test kebab-case identifiers
1330        let input = "package-name: my-awesome-package-v2";
1331        let tokens = lex(input);
1332        assert_eq!(tokens[0], (SyntaxKind::STRING, "package-name"));
1333        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
1334        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
1335        assert_eq!(tokens[3], (SyntaxKind::STRING, "my-awesome-package-v2"));
1336
1337        // Test UUID-like strings
1338        let input = "id: 123e4567-e89b-12d3-a456-426614174000";
1339        let tokens = lex(input);
1340        assert_eq!(tokens[0], (SyntaxKind::STRING, "id"));
1341        assert_eq!(
1342            tokens[3],
1343            (SyntaxKind::STRING, "123e4567-e89b-12d3-a456-426614174000")
1344        );
1345
1346        // Test command-line arguments
1347        let input = "args: --verbose --log-level=debug";
1348        let tokens = lex(input);
1349        // Double dashes are tokenized as two DASH tokens
1350        assert_eq!(
1351            tokens
1352                .windows(3)
1353                .filter(|w| {
1354                    w[0] == (SyntaxKind::DASH, "-")
1355                        && w[1] == (SyntaxKind::DASH, "-")
1356                        && w[2] == (SyntaxKind::STRING, "verbose")
1357                })
1358                .count(),
1359            1
1360        );
1361
1362        // Test negative numbers
1363        let input = "temperature: -40";
1364        let tokens = lex(input);
1365        // Negative numbers are tokenized as INT tokens
1366        assert_eq!(
1367            tokens
1368                .iter()
1369                .filter(|(kind, text)| *kind == SyntaxKind::INT && *text == "-40")
1370                .count(),
1371            1
1372        );
1373
1374        // Test ranges
1375        let input = "range: 1-10";
1376        let tokens = lex(input);
1377        assert_eq!(
1378            tokens
1379                .iter()
1380                .filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "1-10")
1381                .count(),
1382            1
1383        );
1384    }
1385
1386    #[test]
1387    fn test_sequence_markers_with_indentation() {
1388        // Test basic sequence
1389        let input = "- item1\n- item2";
1390        let tokens = lex(input);
1391        assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
1392        assert_eq!(tokens[1], (SyntaxKind::WHITESPACE, " "));
1393        assert_eq!(tokens[2], (SyntaxKind::STRING, "item1"));
1394
1395        // Test indented sequence
1396        let input = "  - item1\n  - item2";
1397        let tokens = lex(input);
1398        assert_eq!(tokens[0], (SyntaxKind::INDENT, "  "));
1399        assert_eq!(tokens[1], (SyntaxKind::DASH, "-"));
1400
1401        // Test nested sequences
1402        let input = "- item1\n  - nested1\n  - nested2\n- item2";
1403        let tokens = lex(input);
1404        let dash_tokens: Vec<_> = tokens
1405            .iter()
1406            .filter(|(kind, _)| *kind == SyntaxKind::DASH)
1407            .collect();
1408        assert_eq!(dash_tokens.len(), 4); // Four sequence markers
1409
1410        // Test sequence with hyphenated values
1411        let input = "- first-item\n- second-item";
1412        let tokens = lex(input);
1413        assert_eq!(tokens[0], (SyntaxKind::DASH, "-"));
1414        assert_eq!(tokens[2], (SyntaxKind::STRING, "first-item"));
1415        assert_eq!(tokens[4], (SyntaxKind::DASH, "-"));
1416        assert_eq!(tokens[6], (SyntaxKind::STRING, "second-item"));
1417    }
1418
1419    #[test]
1420    fn test_dash_after_colon() {
1421        // Test hyphen immediately after colon
1422        // According to YAML spec, "key:-value" is a single plain scalar
1423        // because the colon is not followed by whitespace
1424        let input = "key:-value";
1425        let tokens = lex(input);
1426        assert_eq!(tokens[0], (SyntaxKind::STRING, "key:-value"));
1427
1428        // Test with space - this creates a mapping
1429        let input = "key: -value";
1430        let tokens = lex(input);
1431        assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
1432        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
1433        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
1434        assert_eq!(tokens[3], (SyntaxKind::STRING, "-value"));
1435    }
1436
1437    #[test]
1438    fn test_yaml_spec_compliant_colon_handling() {
1439        // Test that colons are handled according to YAML spec:
1440        // - Colon followed by whitespace indicates mapping
1441        // - Colon not followed by whitespace is part of plain scalar
1442
1443        // URLs should be single scalars (no space after colon)
1444        let input = "http://example.com:8080/path";
1445        let tokens = lex(input);
1446        assert_eq!(tokens.len(), 1);
1447        assert_eq!(
1448            tokens[0],
1449            (SyntaxKind::STRING, "http://example.com:8080/path")
1450        );
1451
1452        // Timestamps should be single scalars
1453        let input = "2024:12:31:23:59:59";
1454        let tokens = lex(input);
1455        assert_eq!(tokens.len(), 1);
1456        assert_eq!(tokens[0], (SyntaxKind::STRING, "2024:12:31:23:59:59"));
1457
1458        // Key-value pairs need space after colon
1459        let input = "key: value";
1460        let tokens = lex(input);
1461        assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
1462        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
1463        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
1464        assert_eq!(tokens[3], (SyntaxKind::STRING, "value"));
1465
1466        // Without space, it's a single scalar
1467        let input = "key:value";
1468        let tokens = lex(input);
1469        assert_eq!(tokens.len(), 1);
1470        assert_eq!(tokens[0], (SyntaxKind::STRING, "key:value"));
1471
1472        // Multiple colons without spaces
1473        let input = "a:b:c:d";
1474        let tokens = lex(input);
1475        assert_eq!(tokens.len(), 1);
1476        assert_eq!(tokens[0], (SyntaxKind::STRING, "a:b:c:d"));
1477    }
1478
1479    #[test]
1480    fn test_block_scalar_with_chomping() {
1481        // Helper to count tokens by kind
1482        let count_kind = |toks: &[(SyntaxKind, &str)], k: SyntaxKind| {
1483            toks.iter().filter(|(kind, _)| *kind == k).count()
1484        };
1485
1486        // Test literal block scalar with strip chomping
1487        let input = "text: |-\n  content";
1488        let tokens = lex(input);
1489        assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
1490        assert_eq!(
1491            tokens
1492                .iter()
1493                .filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
1494                .count(),
1495            1
1496        );
1497
1498        // Test literal block scalar with keep chomping
1499        let input = "text: |+\n  content";
1500        let tokens = lex(input);
1501        assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
1502        assert_eq!(count_kind(&tokens, SyntaxKind::PLUS), 1);
1503
1504        // Test folded block scalar with strip chomping
1505        let input = "text: >-\n  content";
1506        let tokens = lex(input);
1507        assert_eq!(count_kind(&tokens, SyntaxKind::GREATER), 1);
1508        assert_eq!(
1509            tokens
1510                .iter()
1511                .filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "-")
1512                .count(),
1513            1
1514        );
1515
1516        // Test with explicit indentation and chomping
1517        let input = "text: |2-\n  content";
1518        let tokens = lex(input);
1519        assert_eq!(count_kind(&tokens, SyntaxKind::PIPE), 1);
1520        // The "2-" after pipe gets read as one token because hyphens in scalars are included
1521        let has_2_token = tokens.iter().any(|(kind, text)| {
1522            (*kind == SyntaxKind::STRING || *kind == SyntaxKind::INT) && text.contains("2")
1523        });
1524        assert!(has_2_token, "expected a token containing '2'");
1525    }
1526
1527    #[test]
1528    fn test_dash_edge_cases() {
1529        // Test trailing hyphen
1530        let input = "value-";
1531        let tokens = lex(input);
1532        assert_eq!(tokens[0], (SyntaxKind::STRING, "value-"));
1533
1534        // Test leading hyphen (not a sequence marker)
1535        let input = "-value";
1536        let tokens = lex(input);
1537        assert_eq!(tokens[0], (SyntaxKind::STRING, "-value"));
1538
1539        // Test multiple consecutive hyphens in scalar
1540        let input = "key: a---b";
1541        let tokens = lex(input);
1542        assert_eq!(
1543            tokens
1544                .iter()
1545                .filter(|(kind, text)| *kind == SyntaxKind::STRING && *text == "a---b")
1546                .count(),
1547            1
1548        );
1549
1550        // Test hyphen at end of line
1551        let input = "key: value-\nnext: item";
1552        let tokens = lex(input);
1553        assert!(tokens
1554            .iter()
1555            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "value-"));
1556
1557        // Test mix of dashes and underscores
1558        let input = "snake_case-with-dash_mix";
1559        let tokens = lex(input);
1560        assert_eq!(tokens[0], (SyntaxKind::STRING, "snake_case-with-dash_mix"));
1561    }
1562
1563    #[test]
1564    fn test_whitespace_validation_tab_indentation() {
1565        // Test tab character validation
1566        let input_with_tabs = "key: value\n\tindented_key: indented_value";
1567        let (tokens, errors) = lex_with_validation(input_with_tabs);
1568
1569        // Should have detected tab indentation error
1570        assert_eq!(errors.len(), 1);
1571        assert_eq!(errors[0].category, WhitespaceErrorCategory::TabIndentation);
1572        assert_eq!(
1573            errors[0].message,
1574            "Tab character used for indentation (forbidden in YAML)"
1575        );
1576
1577        // But should still tokenize correctly
1578        assert!(tokens
1579            .iter()
1580            .any(|(kind, text)| *kind == SyntaxKind::INDENT && text.contains('\t')));
1581    }
1582
1583    #[test]
1584    fn test_whitespace_validation_line_endings() {
1585        // Test mixed line ending detection
1586        let input_mixed = "line1\nline2\r\nline3\rline4";
1587        let config = ValidationConfig {
1588            enforce_consistent_line_endings: true,
1589            max_line_length: None,
1590        };
1591        let (tokens, errors) = lex_with_validation_config(input_mixed, &config);
1592
1593        // Should detect mixed line endings
1594        assert!(errors
1595            .iter()
1596            .any(|e| e.category == WhitespaceErrorCategory::MixedLineEndings));
1597
1598        // Should still tokenize all line endings
1599        let newlines: Vec<_> = tokens
1600            .iter()
1601            .filter(|(kind, _)| *kind == SyntaxKind::NEWLINE)
1602            .collect();
1603        assert_eq!(newlines.len(), 3); // Three line endings
1604        assert_eq!(newlines[0].1, "\n");
1605        assert_eq!(newlines[1].1, "\r\n");
1606        assert_eq!(newlines[2].1, "\r");
1607    }
1608
1609    #[test]
1610    fn test_whitespace_validation_line_length() {
1611        // Test line length validation
1612        let long_line = format!("key: {}", "a".repeat(150));
1613        let config = ValidationConfig {
1614            enforce_consistent_line_endings: false,
1615            max_line_length: Some(120),
1616        };
1617        let (_, errors) = lex_with_validation_config(&long_line, &config);
1618
1619        // Should detect line too long
1620        assert_eq!(errors.len(), 1);
1621        assert_eq!(errors[0].category, WhitespaceErrorCategory::LineTooLong);
1622        assert_eq!(errors[0].message, "Line too long (155 > 120 characters)");
1623    }
1624
1625    #[test]
1626    fn test_whitespace_validation_disabled() {
1627        // Test with validation disabled
1628        let input_with_issues = "key: value\n\tindented: with_tabs\n";
1629        let config = ValidationConfig {
1630            enforce_consistent_line_endings: false,
1631            max_line_length: None,
1632        };
1633        let (tokens, errors) = lex_with_validation_config(input_with_issues, &config);
1634
1635        // Should still detect tab indentation (always enforced in YAML)
1636        assert_eq!(errors.len(), 1);
1637        assert_eq!(errors[0].category, WhitespaceErrorCategory::TabIndentation);
1638
1639        // Should tokenize normally
1640        assert!(!tokens.is_empty());
1641    }
1642
1643    #[test]
1644    fn test_dash_in_flow_collections() {
1645        // Test dash in flow sequence
1646        let input = "[item-one, item-two]";
1647        let tokens = lex(input);
1648        assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACKET, "["));
1649        assert_eq!(tokens[1], (SyntaxKind::STRING, "item-one"));
1650        assert_eq!(tokens[2], (SyntaxKind::COMMA, ","));
1651        assert_eq!(tokens[4], (SyntaxKind::STRING, "item-two"));
1652        assert_eq!(tokens[5], (SyntaxKind::RIGHT_BRACKET, "]"));
1653
1654        // Test dash in flow mapping
1655        let input = "{kebab-key: kebab-value}";
1656        let tokens = lex(input);
1657        assert_eq!(tokens[0], (SyntaxKind::LEFT_BRACE, "{"));
1658        assert_eq!(tokens[1], (SyntaxKind::STRING, "kebab-key"));
1659        assert_eq!(tokens[2], (SyntaxKind::COLON, ":"));
1660        assert_eq!(tokens[4], (SyntaxKind::STRING, "kebab-value"));
1661        assert_eq!(tokens[5], (SyntaxKind::RIGHT_BRACE, "}"));
1662    }
1663
1664    #[test]
1665    fn test_dash_with_quotes() {
1666        // Quoted strings should preserve everything inside as STRING tokens
1667        let input = r#"key: "- not a sequence marker""#;
1668        let tokens = lex(input);
1669        assert_eq!(
1670            tokens
1671                .iter()
1672                .filter(|(kind, text)| {
1673                    *kind == SyntaxKind::STRING && *text == "\"- not a sequence marker\""
1674                })
1675                .count(),
1676            1
1677        );
1678
1679        let input = r#"key: '- also not a sequence marker'"#;
1680        let tokens = lex(input);
1681        assert_eq!(
1682            tokens
1683                .iter()
1684                .filter(|(kind, text)| {
1685                    *kind == SyntaxKind::STRING && *text == "'- also not a sequence marker'"
1686                })
1687                .count(),
1688            1
1689        );
1690    }
1691
1692    #[test]
1693    fn test_dash_in_multiline_values() {
1694        // Test multiline with dashes
1695        let input = "description: This is a multi-\n  line value with dashes";
1696        let tokens = lex(input);
1697        assert!(tokens
1698            .iter()
1699            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "multi-"));
1700
1701        // Test continuation with sequence-like line
1702        let input = "text: value\n  - but this is not a sequence";
1703        let tokens = lex(input);
1704        // The dash after indentation should be treated as a sequence marker
1705        let indent_dash: Vec<_> = tokens
1706            .windows(2)
1707            .filter(|w| w[0].0 == SyntaxKind::INDENT && w[1].0 == SyntaxKind::DASH)
1708            .collect();
1709        assert_eq!(indent_dash.len(), 1);
1710    }
1711
1712    #[test]
1713    fn test_dash_special_yaml_values() {
1714        // Test that special YAML values with dashes work
1715        let input = "date: 2024-01-15";
1716        let tokens = lex(input);
1717        assert!(tokens
1718            .iter()
1719            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "2024-01-15"));
1720
1721        // Test ISO timestamp - gets tokenized as multiple parts due to hyphens
1722        let input = "timestamp: 2024-01-15T10:30:00-05:00";
1723        let tokens = lex(input);
1724        // The timestamp is split into multiple tokens but parses correctly
1725        assert!(tokens.iter().any(
1726            |(kind, text)| *kind == SyntaxKind::STRING && *text == "2024-01-15T10:30:00-05:00"
1727        ));
1728
1729        // Test version strings
1730        let input = "version: 1.0.0-beta.1";
1731        let tokens = lex(input);
1732        assert!(tokens
1733            .iter()
1734            .any(|(kind, text)| *kind == SyntaxKind::STRING && *text == "1.0.0-beta.1"));
1735    }
1736
1737    #[test]
1738    fn test_flow_indicators_in_block_scalar() {
1739        // Flow indicators should be allowed in block context scalars
1740        // This is valid YAML: the curly braces are part of the scalar value
1741        let input = "key: unix:///Users/${metadata.username}/path";
1742        let tokens = lex(input);
1743        assert_eq!(tokens.len(), 4);
1744        assert_eq!(tokens[0], (SyntaxKind::STRING, "key"));
1745        assert_eq!(tokens[1], (SyntaxKind::COLON, ":"));
1746        assert_eq!(tokens[2], (SyntaxKind::WHITESPACE, " "));
1747        assert_eq!(
1748            tokens[3],
1749            (
1750                SyntaxKind::STRING,
1751                "unix:///Users/${metadata.username}/path"
1752            )
1753        );
1754    }
1755}