Skip to main content

yaml_edit/
validator.rs

1//! YAML specification validator
2//!
3//! This module provides strict YAML 1.2 specification validation.
4//! While the parser is lenient and focuses on error recovery,
5//! this validator enforces strict spec compliance.
6//!
7//! ## Usage
8//!
9//! ```ignore
10//! use yaml_edit::{Yaml, validator::Validator};
11//!
12//! let yaml = Yaml::parse("some: yaml");
13//! let validator = Validator::new();
14//! let violations = validator.validate(&yaml);
15//!
16//! if violations.is_empty() {
17//!     println!("Strictly spec-compliant!");
18//! } else {
19//!     for violation in violations {
20//!         println!("{}", violation);
21//!     }
22//! }
23//! ```
24
25use crate::yaml::{Document, SyntaxNode};
26use rowan::ast::AstNode;
27use std::fmt;
28
29/// A YAML specification violation found during validation
30#[derive(Debug, Clone, PartialEq, Eq)]
31pub struct Violation {
32    /// Human-readable description of the violation
33    pub message: String,
34    /// Location in the source (line:column format)
35    pub location: Option<String>,
36    /// Byte range in the source text where the violation occurred
37    pub text_range: Option<crate::TextPosition>,
38    /// Severity of the violation
39    pub severity: Severity,
40    /// Specific rule that was violated
41    pub rule: Rule,
42}
43
/// How serious a reported violation is.
///
/// The derived ordering follows declaration order, so `Error < Warning`.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum Severity {
    /// The input is strictly invalid under the YAML 1.2 specification.
    Error,
    /// The input is technically valid but deprecated or discouraged.
    Warning,
}
52
/// Category of YAML specification rule that a violation belongs to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Rule {
    /// Indentation does not follow the specification.
    InvalidIndentation,
    /// A document marker appears in the wrong context.
    InvalidDocumentMarker,
    /// A tab character is used where it is not permitted.
    InvalidTabUsage,
    /// A required syntax element is missing.
    MissingSyntax,
    /// An escape sequence is not valid.
    InvalidEscape,
    /// A mapping contains the same key more than once.
    DuplicateKeys,
    /// An anchor or alias is used incorrectly.
    InvalidAnchor,
    /// A tag is not valid.
    InvalidTag,
    /// Any violation not covered by the categories above.
    Other,
}
75
76impl fmt::Display for Violation {
77    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
78        match &self.location {
79            Some(loc) => write!(
80                f,
81                "[{}] {}: {} ({:?})",
82                match self.severity {
83                    Severity::Error => "ERROR",
84                    Severity::Warning => "WARN",
85                },
86                loc,
87                self.message,
88                self.rule
89            ),
90            None => write!(
91                f,
92                "[{}] {} ({:?})",
93                match self.severity {
94                    Severity::Error => "ERROR",
95                    Severity::Warning => "WARN",
96                },
97                self.message,
98                self.rule
99            ),
100        }
101    }
102}
103
104/// YAML 1.2 specification validator
105///
106/// Performs strict validation checks on parsed YAML documents.
107/// The parser itself is lenient and focuses on error recovery,
108/// while this validator enforces strict spec compliance.
109pub struct Validator {
110    /// Configuration options
111    config: ValidatorConfig,
112}
113
/// Switches selecting which configurable checks a validator runs.
#[derive(Clone, Debug)]
pub struct ValidatorConfig {
    /// Report mappings that contain the same key more than once.
    pub check_duplicate_keys: bool,
    /// Enforce indentation rules.
    pub check_indentation: bool,
    /// Enforce restrictions on tab characters.
    pub check_tabs: bool,
    /// Verify that document markers are correctly placed.
    pub check_document_markers: bool,
    /// Verify anchor and alias usage.
    pub check_anchors: bool,
}
128
129impl Default for ValidatorConfig {
130    fn default() -> Self {
131        Self {
132            check_duplicate_keys: true,
133            check_indentation: true,
134            check_tabs: true,
135            check_document_markers: true,
136            check_anchors: true,
137        }
138    }
139}
140
141/// Walk up to the ROOT node from the given node, or return the node itself if it is already ROOT.
142fn find_root(node: &SyntaxNode) -> SyntaxNode {
143    if node.kind() == crate::SyntaxKind::ROOT {
144        return node.clone();
145    }
146    node.ancestors()
147        .find(|n| n.kind() == crate::SyntaxKind::ROOT)
148        .unwrap_or_else(|| node.clone())
149}
150
151/// Convert a rowan TextRange to a TextPosition.
152fn range_to_text_position(range: rowan::TextRange) -> crate::TextPosition {
153    crate::TextPosition::new(u32::from(range.start()), u32::from(range.end()))
154}
155
156impl Validator {
157    /// Create a new validator with default configuration
158    pub fn new() -> Self {
159        Self {
160            config: ValidatorConfig::default(),
161        }
162    }
163
164    /// Create a validator with custom configuration
165    pub fn with_config(config: ValidatorConfig) -> Self {
166        Self { config }
167    }
168
169    /// Validate a YAML document against YAML 1.2 spec
170    ///
171    /// Returns a list of spec violations. Empty list means strictly compliant.
172    pub fn validate(&self, doc: &Document) -> Vec<Violation> {
173        let mut violations = Vec::new();
174
175        // Check for duplicate directives at document level
176        self.check_duplicate_directives(doc.syntax(), &mut violations);
177
178        // Check for directive without document content
179        self.check_directive_without_document(doc.syntax(), &mut violations);
180
181        // Walk the syntax tree and check for violations
182        // This will catch ERROR nodes created by parser (including content after doc end)
183        self.validate_node(doc.syntax(), &mut violations);
184
185        violations
186    }
187
188    /// Validate from a syntax node (can be ROOT, DOCUMENT, or any node)
189    ///
190    /// This is useful when you need to validate the full parse tree including directives
191    /// that may not be attached to a specific document.
192    pub fn validate_syntax(&self, node: &SyntaxNode) -> Vec<Violation> {
193        let mut violations = Vec::new();
194
195        // Check for duplicate directives
196        self.check_duplicate_directives(node, &mut violations);
197
198        // Check for directives without document content
199        self.check_directives_at_root(node, &mut violations);
200
201        // Check for directives after documents without document end marker
202        self.check_directive_after_document(node, &mut violations);
203
204        // Walk the syntax tree and check for violations
205        self.validate_node(node, &mut violations);
206
207        violations
208    }
209
210    fn validate_node(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
211        use crate::SyntaxKind;
212
213        // Check for tabs in any node's tokens
214        if self.config.check_tabs {
215            self.check_tab_usage(node, violations);
216        }
217
218        // Check current node type
219        // Check for multiple anchor tokens on any node (anchors are tokens, not nodes)
220        if self.config.check_anchors {
221            self.check_multiple_anchors(node, violations);
222        }
223
224        match node.kind() {
225            SyntaxKind::ERROR => {
226                // Parser has marked this as erroneous content
227                // Report it as a validation error
228                let content = node.text().to_string();
229                let preview = if content.len() > 50 {
230                    format!("{}...", &content[..50])
231                } else {
232                    content
233                };
234                violations.push(Violation {
235                    message: format!("Invalid content in document: {:?}", preview),
236                    location: None,
237                    text_range: Some(range_to_text_position(node.text_range())),
238                    severity: Severity::Error,
239                    rule: Rule::Other,
240                });
241            }
242            SyntaxKind::MAPPING_ENTRY => {
243                // Check for multiline implicit keys
244                self.check_implicit_key_multiline(node, violations);
245                // Check for block sequence on same line as mapping key
246                self.check_sequence_on_same_line_as_key(node, violations);
247            }
248            SyntaxKind::SCALAR => {
249                // Check for invalid escape sequences in quoted strings
250                self.check_escape_sequences(node, violations);
251                // Check for content on same line as block scalar indicator
252                self.check_block_scalar_indicator(node, violations);
253                // Check for trailing content after quoted strings
254                self.check_trailing_content_after_quoted(node, violations);
255                // Check for colons in plain scalar values
256                self.check_colon_in_plain_scalar(node, violations);
257                // Check for document markers inside quoted strings
258                self.check_document_marker_in_string(node, violations);
259                // Check for directives inside document content (e.g. %YAML after ---)
260                self.check_directive_in_content(node, violations);
261            }
262            SyntaxKind::DOC_START | SyntaxKind::DOC_END if self.config.check_document_markers => {
263                self.check_document_marker_placement(node, violations);
264            }
265            SyntaxKind::MAPPING => {
266                self.check_flow_collection_commas(node, violations);
267                self.check_block_mapping_entries_on_same_line(node, violations);
268                if self.config.check_duplicate_keys {
269                    self.check_duplicate_keys(node, violations);
270                }
271            }
272            SyntaxKind::SEQUENCE => {
273                self.check_flow_collection_commas(node, violations);
274                self.check_sequence_entry_in_flow(node, violations);
275            }
276            SyntaxKind::VALUE => {
277                self.check_anchor_and_alias(node, violations);
278            }
279            SyntaxKind::DOCUMENT => {
280                self.check_document_level_anchors(node, violations);
281            }
282            _ => {}
283        }
284
285        // Check tokens (like COMMENT, DOC_START, TAG) that are children but not nodes
286        for element in node.children_with_tokens() {
287            if let Some(token) = element.as_token() {
288                // Check COMMENT tokens for whitespace separation
289                if token.kind() == crate::SyntaxKind::COMMENT {
290                    self.check_comment_token_whitespace(token, violations);
291                }
292                // Check DOC_START tokens for content on same line
293                if token.kind() == crate::SyntaxKind::DOC_START {
294                    self.check_doc_start_token_content(token, violations);
295                }
296                // Check TAG tokens for invalid characters
297                if token.kind() == crate::SyntaxKind::TAG {
298                    self.check_tag_characters(token, violations);
299                    self.check_tag_followed_by_comma(token, violations);
300                }
301            }
302        }
303
304        // Check indentation rules
305        self.check_sequence_indentation(node, violations);
306        self.check_quoted_string_indentation(node, violations);
307
308        // Recursively validate child nodes
309        for child in node.children() {
310            self.validate_node(&child, violations);
311        }
312    }
313
314    /// Check for directive without document content
315    fn check_directive_without_document(
316        &self,
317        doc_node: &SyntaxNode,
318        violations: &mut Vec<Violation>,
319    ) {
320        let root = find_root(doc_node);
321
322        // Check if there are any DIRECTIVE nodes
323        let has_directives = root
324            .descendants()
325            .any(|n| n.kind() == crate::SyntaxKind::DIRECTIVE);
326
327        if !has_directives {
328            return;
329        }
330
331        // Check if the document has any actual content
332        // A document with only whitespace, newlines, or document markers is considered empty
333        let has_content = doc_node.descendants().any(|n| {
334            matches!(
335                n.kind(),
336                crate::SyntaxKind::MAPPING
337                    | crate::SyntaxKind::SEQUENCE
338                    | crate::SyntaxKind::SCALAR
339                    | crate::SyntaxKind::STRING
340                    | crate::SyntaxKind::TAGGED_NODE
341            )
342        });
343
344        if !has_content {
345            violations.push(Violation {
346                message: "Directive requires a document with content".to_string(),
347                location: None,
348                text_range: None,
349                severity: Severity::Error,
350                rule: Rule::Other,
351            });
352        }
353    }
354
355    /// Check for directives at root level without following document
356    ///
357    /// This checks if the ROOT node has DIRECTIVE children but no DOCUMENT children with content.
358    fn check_directives_at_root(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
359        use crate::SyntaxKind;
360
361        let check_node = find_root(node);
362
363        // Check if there are any DIRECTIVE children
364        let has_directives = check_node
365            .children()
366            .any(|child| child.kind() == SyntaxKind::DIRECTIVE);
367
368        if !has_directives {
369            return;
370        }
371
372        // Check if there's a DOCUMENT child with actual content
373        let has_document_with_content = check_node.children().any(|child| {
374            if child.kind() == SyntaxKind::DOCUMENT {
375                // Check if this document has content
376                child.descendants().any(|n| {
377                    matches!(
378                        n.kind(),
379                        SyntaxKind::MAPPING
380                            | SyntaxKind::SEQUENCE
381                            | SyntaxKind::SCALAR
382                            | SyntaxKind::STRING
383                            | SyntaxKind::TAGGED_NODE
384                    )
385                })
386            } else {
387                false
388            }
389        });
390
391        if !has_document_with_content {
392            violations.push(Violation {
393                message: "Directive without document content".to_string(),
394                location: None,
395                text_range: None,
396                severity: Severity::Error,
397                rule: Rule::Other,
398            });
399        }
400    }
401
402    /// Check for directives appearing after documents without document end marker (...)
403    ///
404    /// Per YAML spec, if a directive appears after document content, the document
405    /// must be explicitly ended with `...` before the directive.
406    fn check_directive_after_document(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
407        use crate::SyntaxKind;
408
409        let check_node = find_root(node);
410
411        // Track if we've seen a document with content
412        let mut seen_document_with_content = false;
413
414        for child in check_node.children() {
415            match child.kind() {
416                SyntaxKind::DOCUMENT => {
417                    // Check if this document has content
418                    let has_content = child.descendants().any(|n| {
419                        matches!(
420                            n.kind(),
421                            SyntaxKind::MAPPING
422                                | SyntaxKind::SEQUENCE
423                                | SyntaxKind::SCALAR
424                                | SyntaxKind::STRING
425                                | SyntaxKind::TAGGED_NODE
426                        )
427                    });
428
429                    // Check if this document has a DOC_END marker
430                    let has_doc_end = child
431                        .children_with_tokens()
432                        .any(|t| t.kind() == SyntaxKind::DOC_END);
433
434                    if has_content {
435                        seen_document_with_content = true;
436
437                        // If this document doesn't end with ..., mark that we need one
438                        // before any subsequent directives
439                        if !has_doc_end {
440                            // This document has no end marker - any following directive is invalid
441                            // (we'll check this when we encounter the directive)
442                        }
443                    }
444                }
445                SyntaxKind::DIRECTIVE if seen_document_with_content => {
446                    // If we've seen a document with content and the last document didn't have DOC_END
447                    // Check if the previous DOCUMENT had a DOC_END
448                    let mut prev_sibling = child.prev_sibling();
449                    let mut found_doc_with_end = false;
450
451                    while let Some(prev) = prev_sibling {
452                        if prev.kind() == SyntaxKind::DOCUMENT {
453                            // Check if this document has DOC_END
454                            let has_doc_end = prev
455                                .children_with_tokens()
456                                .any(|t| t.kind() == SyntaxKind::DOC_END);
457
458                            if has_doc_end {
459                                found_doc_with_end = true;
460                            }
461                            break;
462                        }
463                        prev_sibling = prev.prev_sibling();
464                    }
465
466                    if !found_doc_with_end {
467                        violations.push(Violation {
468                            message: "Directive after document requires document end marker (...)"
469                                .to_string(),
470                            location: None,
471                            text_range: None,
472                            severity: Severity::Error,
473                            rule: Rule::Other,
474                        });
475                    }
476                }
477                _ => {}
478            }
479        }
480    }
481
482    /// Check for directive tokens inside document content.
483    ///
484    /// When the parser encounters `%YAML 1.2` after a `---` without a preceding `...`,
485    /// it parses the directive as scalar content. This check catches that case.
486    fn check_directive_in_content(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
487        use crate::SyntaxKind;
488
489        let has_directive = node
490            .children_with_tokens()
491            .any(|c| c.kind() == SyntaxKind::DIRECTIVE);
492        if has_directive {
493            violations.push(Violation {
494                message: "Directive in document content (missing document end marker `...` before directive)".to_string(),
495                location: None,
496                    text_range: Some(range_to_text_position(node.text_range())),
497                severity: Severity::Error,
498                rule: Rule::Other,
499            });
500        }
501    }
502
503    /// Check for duplicate YAML directives
504    fn check_duplicate_directives(&self, doc_node: &SyntaxNode, violations: &mut Vec<Violation>) {
505        use std::collections::HashMap;
506
507        let root = find_root(doc_node);
508
509        // Collect all directives and count by type
510        let mut directive_counts: HashMap<String, usize> = HashMap::new();
511
512        for node in root.descendants() {
513            if node.kind() == crate::SyntaxKind::DIRECTIVE {
514                // Get the directive text (e.g., "%YAML 1.2" or "%TAG ! tag:yaml.org,2002:")
515                let text = node.text().to_string();
516
517                // Extract directive type (YAML, TAG, etc.)
518                if let Some(directive_type) = text.split_whitespace().next() {
519                    *directive_counts
520                        .entry(directive_type.to_string())
521                        .or_insert(0) += 1;
522                }
523            }
524        }
525
526        // Check for duplicates
527        for (directive_type, count) in directive_counts {
528            if count > 1 {
529                violations.push(Violation {
530                    message: format!("Duplicate {} directive", directive_type),
531                    location: None,
532                    text_range: None,
533                    severity: Severity::Error,
534                    rule: Rule::Other,
535                });
536            }
537        }
538    }
539
540    /// Check for multiple anchors on the same node
541    fn check_multiple_anchors(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
542        // Count ANCHOR tokens (not nodes) in this node's children
543        let anchor_count = node
544            .children_with_tokens()
545            .filter(|child| {
546                child
547                    .as_token()
548                    .is_some_and(|t| t.kind() == crate::SyntaxKind::ANCHOR)
549            })
550            .count();
551
552        if anchor_count > 1 {
553            violations.push(Violation {
554                message: "Multiple anchors on the same node".to_string(),
555                location: None,
556                text_range: Some(range_to_text_position(node.text_range())),
557                severity: Severity::Error,
558                rule: Rule::InvalidAnchor,
559            });
560        }
561    }
562
563    /// Check for invalid escape sequences in quoted strings
564    fn check_escape_sequences(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
565        // Check first character to see if this is a quoted string - no allocation needed
566        let first_char = node.first_token().and_then(|t| t.text().chars().next());
567        if first_char != Some('"') {
568            return;
569        }
570
571        // Scan for escape sequences (requires one allocation for the text)
572        let text = node.text().to_string();
573        let mut chars = text.chars().peekable();
574
575        while let Some(ch) = chars.next() {
576            if ch == '\\' {
577                if let Some(&next) = chars.peek() {
578                    // Valid escape sequences in YAML 1.2
579                    let valid_escapes = [
580                        '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 'e', ' ', '"', '/', '\\', 'N', '_',
581                        'L', 'P', 'x', 'u', 'U',
582                    ];
583
584                    if !valid_escapes.contains(&next) {
585                        violations.push(Violation {
586                            message: format!("Invalid escape sequence: \\{}", next),
587                            location: None,
588                            text_range: Some(range_to_text_position(node.text_range())),
589                            severity: Severity::Error,
590                            rule: Rule::InvalidEscape,
591                        });
592                        return; // Found one, no need to continue
593                    }
594                }
595            }
596        }
597    }
598
599    /// Check for content on the same line as block scalar indicator (| or >)
600    ///
601    /// Per YAML spec, block scalar content must start on the line after the indicator.
602    /// Only chomping indicators (+/-) and indentation indicators (1-9) are allowed
603    /// on the same line as the block scalar indicator.
604    fn check_block_scalar_indicator(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
605        // Check if this scalar has a GREATER (folded) or PIPE (literal) indicator
606        let has_block_indicator = node.children_with_tokens().any(|child| {
607            if let rowan::NodeOrToken::Token(token) = child {
608                matches!(
609                    token.kind(),
610                    crate::SyntaxKind::GREATER | crate::SyntaxKind::PIPE
611                )
612            } else {
613                false
614            }
615        });
616
617        if !has_block_indicator {
618            return;
619        }
620
621        // Check if any STRING tokens appear before the first NEWLINE after the indicator
622        let mut found_indicator = false;
623        let mut found_newline = false;
624
625        for child in node.children_with_tokens() {
626            if let rowan::NodeOrToken::Token(token) = child {
627                // Mark when we find the block indicator
628                if matches!(
629                    token.kind(),
630                    crate::SyntaxKind::GREATER | crate::SyntaxKind::PIPE
631                ) {
632                    found_indicator = true;
633                    continue;
634                }
635
636                // After indicator, before newline
637                if found_indicator && !found_newline {
638                    match token.kind() {
639                        crate::SyntaxKind::NEWLINE => {
640                            found_newline = true;
641                        }
642                        crate::SyntaxKind::STRING => {
643                            // Found content on same line as indicator
644                            violations.push(Violation {
645                                message:
646                                    "Block scalar content cannot appear on same line as indicator"
647                                        .to_string(),
648                                location: None,
649                                text_range: None,
650                                severity: Severity::Error,
651                                rule: Rule::Other,
652                            });
653                            return;
654                        }
655                        // WHITESPACE, COMMENT, and chomping/indentation indicators are OK
656                        _ => {}
657                    }
658                }
659            }
660        }
661    }
662
663    /// Check for trailing content after quoted strings
664    ///
665    /// After a quoted string (double or single) closes, only whitespace, newlines,
666    /// or comments should follow. Additional content on the same line is invalid.
667    fn check_trailing_content_after_quoted(
668        &self,
669        node: &SyntaxNode,
670        violations: &mut Vec<Violation>,
671    ) {
672        let mut found_quoted = false;
673        let mut found_quote_end = false;
674        let mut found_newline = false;
675
676        for child in node.children_with_tokens() {
677            if let rowan::NodeOrToken::Token(token) = child {
678                match token.kind() {
679                    crate::SyntaxKind::STRING => {
680                        let text = token.text();
681
682                        // Check if this is a quoted string (starts with " or ')
683                        if !found_quoted && (text.starts_with('"') || text.starts_with('\'')) {
684                            found_quoted = true;
685
686                            // Check if quote ends in this same token
687                            if text.len() > 1 && (text.ends_with('"') || text.ends_with('\'')) {
688                                found_quote_end = true;
689                            }
690                        } else if found_quoted && !found_quote_end {
691                            // Still inside the quoted string
692                            if text.ends_with('"') || text.ends_with('\'') {
693                                found_quote_end = true;
694                            }
695                        } else if found_quote_end && !found_newline {
696                            // Found content after quoted string ended, before newline
697                            violations.push(Violation {
698                                message: "Trailing content after quoted string".to_string(),
699                                location: None,
700                                text_range: None,
701                                severity: Severity::Error,
702                                rule: Rule::Other,
703                            });
704                            return;
705                        }
706                    }
707                    crate::SyntaxKind::NEWLINE => {
708                        found_newline = true;
709                    }
710                    crate::SyntaxKind::WHITESPACE | crate::SyntaxKind::COMMENT => {
711                        // These are allowed after quoted strings
712                    }
713                    _ => {}
714                }
715            }
716        }
717    }
718
719    /// Check for colons in plain scalar values
720    ///
721    /// Plain scalars in block context cannot contain `: ` (colon followed by space)
722    /// without being quoted. This indicates an attempt to create a nested mapping
723    /// within a plain scalar, which is invalid.
724    fn check_colon_in_plain_scalar(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
725        // Check if this scalar contains COLON tokens
726        let has_colon = node.children_with_tokens().any(|child| {
727            if let rowan::NodeOrToken::Token(token) = child {
728                token.kind() == crate::SyntaxKind::COLON
729            } else {
730                false
731            }
732        });
733
734        if !has_colon {
735            return;
736        }
737
738        // Check if this is a quoted string (which can contain colons)
739        let is_quoted = node.first_token().is_some_and(|t| {
740            let text = t.text();
741            text.starts_with('"') || text.starts_with('\'')
742        });
743
744        if is_quoted {
745            return;
746        }
747
748        // Check if this scalar is inside a VALUE node (not a KEY)
749        // Keys can have plain text without issues, but values with colons need special handling
750        let parent_is_value = node
751            .parent()
752            .is_some_and(|p| p.kind() == crate::SyntaxKind::VALUE);
753
754        if parent_is_value {
755            violations.push(Violation {
756                message: "Plain scalar value cannot contain mapping syntax (colon)".to_string(),
757                location: None,
758                text_range: None,
759                severity: Severity::Error,
760                rule: Rule::Other,
761            });
762        }
763    }
764
765    /// Check for document markers (--- or ...) appearing in quoted strings
766    ///
767    /// Document markers on their own line should always be recognized as document
768    /// boundaries, even if they appear to be within a quoted string. A quoted string
769    /// containing "\n---\n" or "\n...\n" is invalid.
770    fn check_document_marker_in_string(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
771        // Get the text of the scalar
772        let text = node.text().to_string();
773
774        // Check if this is a quoted string
775        if !text.starts_with('"') && !text.starts_with('\'') {
776            return;
777        }
778
779        // Check for document markers on their own line within the string
780        // Look for \n--- or \n... where the marker is followed by \n or end of string
781        if text.contains("\n---\n")
782            || text.contains("\n---\"")
783            || text.contains("\n---'")
784            || text.contains("\n...\n")
785            || text.contains("\n...\"")
786            || text.contains("\n...'")
787        {
788            violations.push(Violation {
789                message: "Document marker on its own line inside quoted string".to_string(),
790                location: None,
791                text_range: Some(range_to_text_position(node.text_range())),
792                severity: Severity::Error,
793                rule: Rule::InvalidDocumentMarker,
794            });
795        }
796    }
797
798    /// Check for tab usage in whitespace nodes
799    fn check_tab_usage(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
800        // Only check whitespace nodes - more efficient than serializing everything
801        // Check each token directly without allocation
802        for token in node.children_with_tokens() {
803            if let rowan::NodeOrToken::Token(token) = token {
804                // Check the token text directly - this is a cheap slice operation
805                if token.text().contains('\t') {
806                    violations.push(Violation {
807                        message: "Tabs are not allowed for indentation in YAML".to_string(),
808                        location: None,
809                        text_range: Some(range_to_text_position(token.text_range())),
810                        severity: Severity::Error,
811                        rule: Rule::InvalidTabUsage,
812                    });
813                    return; // Found one, no need to keep checking
814                }
815            }
816        }
817    }
818
819    /// Check document marker placement
820    fn check_document_marker_placement(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
821        // Document markers should only appear at document boundaries
822        // Check if marker is in inappropriate context (e.g., inside a quoted string)
823        if let Some(parent) = node.parent() {
824            if matches!(
825                parent.kind(),
826                crate::SyntaxKind::STRING | crate::SyntaxKind::SCALAR
827            ) {
828                violations.push(Violation {
829                    message: "Document marker inside string is invalid".to_string(),
830                    location: None,
831                    text_range: Some(range_to_text_position(node.text_range())),
832                    severity: Severity::Error,
833                    rule: Rule::InvalidDocumentMarker,
834                });
835            }
836        }
837    }
838
839    /// Check for missing commas in flow collections
840    fn check_flow_collection_commas(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
841        // Check first token to see if this is a flow collection - avoid full serialization
842        let first_token = node.first_token();
843        let is_flow_mapping = first_token.as_ref().is_some_and(|t| t.text() == "{");
844        let is_flow_sequence = first_token.as_ref().is_some_and(|t| t.text() == "[");
845
846        if !is_flow_mapping && !is_flow_sequence {
847            return;
848        }
849
850        // Count entries and commas
851        let entry_kind = if is_flow_mapping {
852            crate::SyntaxKind::MAPPING_ENTRY
853        } else {
854            crate::SyntaxKind::SEQUENCE_ENTRY
855        };
856
857        let mut entry_count = 0;
858        let mut comma_count = 0;
859        let mut prev_was_comma = false;
860
861        for child in node.children() {
862            match child.kind() {
863                k if k == entry_kind => entry_count += 1,
864                crate::SyntaxKind::COMMA => {
865                    comma_count += 1;
866                    if prev_was_comma {
867                        violations.push(Violation {
868                            message: "Double comma in flow collection".to_string(),
869                            location: None,
870                            text_range: Some(range_to_text_position(node.text_range())),
871                            severity: Severity::Error,
872                            rule: Rule::Other,
873                        });
874                    }
875                    prev_was_comma = true;
876                }
877                crate::SyntaxKind::WHITESPACE | crate::SyntaxKind::NEWLINE => {}
878                _ => prev_was_comma = false,
879            }
880        }
881
882        // Flow collections need n-1 commas for n entries (except when trailing comma)
883        if entry_count > 1 && comma_count < entry_count - 1 {
884            violations.push(Violation {
885                message: format!(
886                    "Flow collection missing commas: {} entries but only {} commas",
887                    entry_count, comma_count
888                ),
889                location: None,
890                text_range: None,
891                severity: Severity::Error,
892                rule: Rule::MissingSyntax,
893            });
894        }
895    }
896
897    /// Check for multiple mapping entries on the same line in block mappings
898    ///
899    /// In block mappings (not flow mappings with {}), each mapping entry should
900    /// be on its own line. Multiple entries on the same line are invalid.
901    fn check_block_mapping_entries_on_same_line(
902        &self,
903        node: &SyntaxNode,
904        violations: &mut Vec<Violation>,
905    ) {
906        // Check if this is a flow mapping (which allows same-line entries)
907        let first_token = node.first_token();
908        let is_flow_mapping = first_token.as_ref().is_some_and(|t| t.text() == "{");
909
910        if is_flow_mapping {
911            return; // Flow mappings can have entries on same line
912        }
913
914        // Check for consecutive MAPPING_ENTRY nodes without NEWLINE between them
915        let mut prev_entry: Option<SyntaxNode> = None;
916
917        for child in node.children() {
918            if child.kind() == crate::SyntaxKind::MAPPING_ENTRY {
919                if let Some(prev) = prev_entry {
920                    // Check if there's a NEWLINE between prev and current entry.
921                    // The newline may be inside the previous entry (as its last
922                    // token) or between entries as a sibling token.
923                    let has_newline_between = {
924                        // First check if the previous entry ends with a newline
925                        let prev_ends_with_newline = prev
926                            .last_token()
927                            .is_some_and(|t| t.kind() == crate::SyntaxKind::NEWLINE);
928
929                        if prev_ends_with_newline {
930                            true
931                        } else {
932                            // Check sibling tokens between the entries
933                            let mut current_sibling = prev.next_sibling_or_token();
934                            let mut found_newline = false;
935
936                            while let Some(sibling) = current_sibling {
937                                if let rowan::NodeOrToken::Node(n) = &sibling {
938                                    if n == &child {
939                                        break;
940                                    }
941                                }
942
943                                if let rowan::NodeOrToken::Token(t) = &sibling {
944                                    if t.kind() == crate::SyntaxKind::NEWLINE {
945                                        found_newline = true;
946                                        break;
947                                    }
948                                }
949
950                                current_sibling = sibling.next_sibling_or_token();
951                            }
952
953                            found_newline
954                        }
955                    };
956
957                    if !has_newline_between {
958                        violations.push(Violation {
959                            message: "Block mapping entries must be on separate lines".to_string(),
960                            location: None,
961                            text_range: None,
962                            severity: Severity::Error,
963                            rule: Rule::Other,
964                        });
965                        return; // One violation is enough
966                    }
967                }
968
969                prev_entry = Some(child);
970            }
971        }
972    }
973
974    /// Check for SEQUENCE_ENTRY nodes in flow sequences
975    ///
976    /// Flow sequences (using []) should not have SEQUENCE_ENTRY children.
977    /// SEQUENCE_ENTRY is only for block sequences (using -). In flow sequences,
978    /// values appear directly without the - marker.
979    fn check_sequence_entry_in_flow(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
980        // Check if this is a flow sequence
981        let first_token = node.first_token();
982        let is_flow_sequence = first_token.as_ref().is_some_and(|t| t.text() == "[");
983
984        if !is_flow_sequence {
985            return;
986        }
987
988        // Check for SEQUENCE_ENTRY children
989        for child in node.children() {
990            if child.kind() == crate::SyntaxKind::SEQUENCE_ENTRY {
991                violations.push(Violation {
992                    message: "Flow sequence cannot use block sequence syntax (-)".to_string(),
993                    location: None,
994                    text_range: Some(range_to_text_position(node.text_range())),
995                    severity: Severity::Error,
996                    rule: Rule::Other,
997                });
998                return; // One violation is enough
999            }
1000        }
1001    }
1002
1003    /// Check for anchors at document level without proper node attachment
1004    ///
1005    /// Anchors should be attached to nodes (values), not floating at document level.
1006    /// This is often the result of syntax errors like `&anchor - item`.
1007    fn check_document_level_anchors(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
1008        // Check for ANCHOR tokens that are direct children of DOCUMENT
1009        for child in node.children_with_tokens() {
1010            if let rowan::NodeOrToken::Token(token) = child {
1011                if token.kind() == crate::SyntaxKind::ANCHOR {
1012                    violations.push(Violation {
1013                        message: "Anchor must be attached to a node, not at document level"
1014                            .to_string(),
1015                        location: None,
1016                        text_range: Some(range_to_text_position(token.text_range())),
1017                        severity: Severity::Error,
1018                        rule: Rule::Other,
1019                    });
1020                }
1021            }
1022        }
1023    }
1024
1025    /// Check for both anchor and alias on the same value
1026    ///
1027    /// Per YAML spec, a node can have an anchor (defining a reusable node) OR
1028    /// be an alias (referencing another node), but not both.
1029    fn check_anchor_and_alias(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
1030        let mut has_anchor = false;
1031        let mut has_alias = false;
1032
1033        // Check for ANCHOR tokens at this level
1034        for child in node.children_with_tokens() {
1035            if let rowan::NodeOrToken::Token(token) = child {
1036                if token.kind() == crate::SyntaxKind::ANCHOR {
1037                    has_anchor = true;
1038                }
1039            }
1040        }
1041
1042        // Check for REFERENCE tokens in descendant SCALAR nodes
1043        for desc in node.descendants() {
1044            if desc.kind() == crate::SyntaxKind::SCALAR {
1045                for token in desc.children_with_tokens() {
1046                    if let rowan::NodeOrToken::Token(t) = token {
1047                        if t.kind() == crate::SyntaxKind::REFERENCE {
1048                            has_alias = true;
1049                            break;
1050                        }
1051                    }
1052                }
1053            }
1054        }
1055
1056        if has_anchor && has_alias {
1057            violations.push(Violation {
1058                message: "Node cannot have both an anchor and be an alias".to_string(),
1059                location: None,
1060                text_range: None,
1061                severity: Severity::Error,
1062                rule: Rule::Other,
1063            });
1064        }
1065    }
1066
1067    /// Check that comment tokens have whitespace separation
1068    ///
1069    /// YAML spec requires that comment markers (#) must be separated from other content
1070    /// by whitespace. This checks if a COMMENT token appears without preceding whitespace.
1071    fn check_comment_token_whitespace(
1072        &self,
1073        token: &rowan::SyntaxToken<crate::Lang>,
1074        violations: &mut Vec<Violation>,
1075    ) {
1076        // Check if there's a previous sibling token/node
1077        if let Some(prev) = token.prev_sibling_or_token() {
1078            match prev {
1079                rowan::NodeOrToken::Token(prev_token) => {
1080                    // Comment should be preceded by whitespace or newline token
1081                    if prev_token.kind() != crate::SyntaxKind::WHITESPACE
1082                        && prev_token.kind() != crate::SyntaxKind::NEWLINE
1083                    {
1084                        violations.push(Violation {
1085                            message: "Comment without whitespace separation".to_string(),
1086                            location: None,
1087                            text_range: Some(range_to_text_position(token.text_range())),
1088                            severity: Severity::Error,
1089                            rule: Rule::Other,
1090                        });
1091                    }
1092                }
1093                rowan::NodeOrToken::Node(_prev_node) => {
1094                    // If preceded by a node (not whitespace token), that's also invalid
1095                    violations.push(Violation {
1096                        message: "Comment without whitespace separation".to_string(),
1097                        location: None,
1098                        text_range: Some(range_to_text_position(token.text_range())),
1099                        severity: Severity::Error,
1100                        rule: Rule::Other,
1101                    });
1102                }
1103            }
1104        }
1105    }
1106
1107    /// Check that content doesn't appear on same line as document start marker
1108    ///
1109    /// According to YAML spec, content should not appear on the same line as
1110    /// a document start marker (---).
1111    fn check_doc_start_token_content(
1112        &self,
1113        token: &rowan::SyntaxToken<crate::Lang>,
1114        violations: &mut Vec<Violation>,
1115    ) {
1116        // Look at siblings after DOC_START token
1117        let mut found_newline = false;
1118        let mut found_content = false;
1119
1120        // Check if there's content before a newline
1121        let mut current = token.next_sibling_or_token();
1122        while let Some(sibling) = current {
1123            let next = match &sibling {
1124                rowan::NodeOrToken::Token(t) => {
1125                    match t.kind() {
1126                        crate::SyntaxKind::NEWLINE => {
1127                            found_newline = true;
1128                            break;
1129                        }
1130                        crate::SyntaxKind::WHITESPACE | crate::SyntaxKind::COMMENT => {
1131                            // Whitespace and comments are OK
1132                        }
1133                        _ => {}
1134                    }
1135                    t.next_sibling_or_token()
1136                }
1137                rowan::NodeOrToken::Node(n) => {
1138                    // Any node here means content
1139                    match n.kind() {
1140                        crate::SyntaxKind::MAPPING
1141                        | crate::SyntaxKind::SEQUENCE
1142                        | crate::SyntaxKind::SCALAR
1143                        | crate::SyntaxKind::TAGGED_NODE => {
1144                            found_content = true;
1145                            break;
1146                        }
1147                        _ => {}
1148                    }
1149                    n.next_sibling_or_token()
1150                }
1151            };
1152            current = next;
1153        }
1154
1155        if found_content && !found_newline {
1156            violations.push(Violation {
1157                message: "Content on same line as document start marker".to_string(),
1158                location: None,
1159                text_range: None,
1160                severity: Severity::Error,
1161                rule: Rule::InvalidDocumentMarker,
1162            });
1163        }
1164    }
1165
1166    /// Check that TAG tokens don't contain invalid characters
1167    ///
1168    /// YAML spec restricts which characters can appear in tags.
1169    /// Tags cannot contain: {, }, [, ], or comma (,)
1170    fn check_tag_characters(
1171        &self,
1172        token: &rowan::SyntaxToken<crate::Lang>,
1173        violations: &mut Vec<Violation>,
1174    ) {
1175        let tag_text = token.text();
1176
1177        // Check for invalid characters in tags
1178        let invalid_chars = ['{', '}', '[', ']', ','];
1179        for ch in invalid_chars {
1180            if tag_text.contains(ch) {
1181                violations.push(Violation {
1182                    message: format!("Invalid character '{}' in tag", ch),
1183                    location: None,
1184                    text_range: Some(range_to_text_position(token.text_range())),
1185                    severity: Severity::Error,
1186                    rule: Rule::InvalidTag,
1187                });
1188                return; // Only report once per tag
1189            }
1190        }
1191    }
1192
1193    /// Check that a TAG token is not immediately followed by a comma
1194    ///
1195    /// Per YAML spec, a tag must be followed by whitespace and then the tagged value.
1196    /// A comma immediately after a tag (without whitespace and value) is invalid.
1197    /// Example: `!!str, xxx` is invalid; should be `!!str xxx` or `!!str "xxx"`
1198    fn check_tag_followed_by_comma(
1199        &self,
1200        token: &rowan::SyntaxToken<crate::Lang>,
1201        violations: &mut Vec<Violation>,
1202    ) {
1203        // Look at the next sibling after the TAG token
1204        let mut current = token.next_sibling_or_token();
1205
1206        // Skip whitespace to find the next meaningful element
1207        while let Some(sibling) = current {
1208            match &sibling {
1209                rowan::NodeOrToken::Token(t) => {
1210                    match t.kind() {
1211                        crate::SyntaxKind::WHITESPACE | crate::SyntaxKind::NEWLINE => {
1212                            // Whitespace is expected, continue to next
1213                            current = t.next_sibling_or_token();
1214                            continue;
1215                        }
1216                        crate::SyntaxKind::COMMA => {
1217                            // Found a comma directly after the tag - this is invalid
1218                            violations.push(Violation {
1219                                message: "Invalid comma after tag".to_string(),
1220                                location: None,
1221                                text_range: Some(range_to_text_position(token.text_range())),
1222                                severity: Severity::Error,
1223                                rule: Rule::InvalidTag,
1224                            });
1225                            return;
1226                        }
1227                        _ => {
1228                            // Found some other token - that's fine
1229                            return;
1230                        }
1231                    }
1232                }
1233                rowan::NodeOrToken::Node(n) => {
1234                    // Found a node - check if it's a SCALAR that starts with a comma
1235                    if n.kind() == crate::SyntaxKind::SCALAR {
1236                        // Check if the first token in this scalar is a comma
1237                        for child in n.children_with_tokens() {
1238                            if let rowan::NodeOrToken::Token(t) = child {
1239                                if t.kind() == crate::SyntaxKind::COMMA {
1240                                    // The scalar starts with a comma - invalid after a tag
1241                                    violations.push(Violation {
1242                                        message: "Invalid comma after tag".to_string(),
1243                                        location: None,
1244                                        text_range: None,
1245                                        severity: Severity::Error,
1246                                        rule: Rule::InvalidTag,
1247                                    });
1248                                    return;
1249                                } else if t.kind() != crate::SyntaxKind::WHITESPACE
1250                                    && t.kind() != crate::SyntaxKind::NEWLINE
1251                                {
1252                                    // Found a non-comma, non-whitespace token - that's fine
1253                                    return;
1254                                }
1255                            }
1256                        }
1257                    }
1258                    // Other node types are fine
1259                    return;
1260                }
1261            }
1262        }
1263    }
1264
1265    /// Check that implicit keys don't span multiple lines
1266    ///
1267    /// YAML spec restricts implicit keys (keys without explicit ? marker) to a single line.
1268    /// This checks if a KEY node in a MAPPING_ENTRY contains newline characters.
1269    fn check_implicit_key_multiline(
1270        &self,
1271        entry_node: &SyntaxNode,
1272        violations: &mut Vec<Violation>,
1273    ) {
1274        // Find the KEY node within the MAPPING_ENTRY
1275        for child in entry_node.children() {
1276            if child.kind() == crate::SyntaxKind::KEY {
1277                // Check if the key's text contains a newline
1278                let key_text = child.text().to_string();
1279                if key_text.contains('\n') {
1280                    violations.push(Violation {
1281                        message: "Implicit key cannot span multiple lines".to_string(),
1282                        location: None,
1283                        text_range: Some(range_to_text_position(child.text_range())),
1284                        severity: Severity::Error,
1285                        rule: Rule::Other,
1286                    });
1287                    return; // Only report once per entry
1288                }
1289            }
1290        }
1291    }
1292
1293    /// Check for block sequence starting on same line as mapping key
1294    ///
1295    /// YAML 1.2 spec section 6.3.1 requires block sequences to start on a new line
1296    /// after the mapping key and colon. Example of invalid YAML:
1297    /// ```yaml
1298    /// key: - a
1299    ///      - b
1300    /// ```
1301    fn check_sequence_on_same_line_as_key(
1302        &self,
1303        entry_node: &SyntaxNode,
1304        violations: &mut Vec<Violation>,
1305    ) {
1306        use crate::SyntaxKind;
1307
1308        // Find the KEY and VALUE nodes within the MAPPING_ENTRY
1309        let mut key_node: Option<SyntaxNode> = None;
1310        let mut value_node: Option<SyntaxNode> = None;
1311
1312        for child in entry_node.children() {
1313            match child.kind() {
1314                SyntaxKind::KEY => key_node = Some(child),
1315                SyntaxKind::VALUE => value_node = Some(child),
1316                _ => {}
1317            }
1318        }
1319
1320        // If there's no value, nothing to check
1321        let Some(value) = value_node else { return };
1322
1323        // Check if the value is a block sequence
1324        let mut sequence_node: Option<SyntaxNode> = None;
1325        for child in value.children() {
1326            if child.kind() == SyntaxKind::SEQUENCE {
1327                sequence_node = Some(child);
1328                break;
1329            }
1330        }
1331
1332        let Some(sequence) = sequence_node else {
1333            return;
1334        };
1335
1336        // Check if this is a block sequence (not flow)
1337        let first_token = sequence.first_token();
1338        let is_flow_sequence = first_token.as_ref().is_some_and(|t| t.text() == "[");
1339
1340        if is_flow_sequence {
1341            return; // Flow sequences can be on same line
1342        }
1343
1344        // Check for a NEWLINE between key (or COLON) and the sequence
1345
1346        // Find the COLON token that separates key and value
1347        let mut found_colon = false;
1348        let mut has_newline = false;
1349
1350        if let Some(key) = key_node {
1351            // Start from after the key
1352            let mut current = key.next_sibling_or_token();
1353
1354            while let Some(element) = current {
1355                if let rowan::NodeOrToken::Token(t) = &element {
1356                    if t.kind() == SyntaxKind::COLON {
1357                        found_colon = true;
1358                    } else if found_colon && t.kind() == SyntaxKind::NEWLINE {
1359                        has_newline = true;
1360                        break;
1361                    }
1362                }
1363
1364                // Stop if we reach the sequence node
1365                if let rowan::NodeOrToken::Node(n) = &element {
1366                    if n == &sequence {
1367                        break;
1368                    }
1369                }
1370
1371                current = element.next_sibling_or_token();
1372            }
1373        }
1374
1375        // If there's no newline between the colon and the sequence, it's invalid
1376        if !has_newline {
1377            violations.push(Violation {
1378                message: "Block sequence cannot start on same line as mapping key".to_string(),
1379                location: None,
1380                text_range: None,
1381                severity: Severity::Error,
1382                rule: Rule::Other,
1383            });
1384        }
1385    }
1386
1387    /// Helper to calculate column position from text offset
1388    fn get_column(&self, text: &str, offset: usize) -> usize {
1389        let mut col = 0;
1390        for (i, ch) in text.char_indices() {
1391            if i >= offset {
1392                break;
1393            }
1394            if ch == '\n' {
1395                col = 0;
1396            } else {
1397                col += 1;
1398            }
1399        }
1400        col
1401    }
1402
1403    /// Check sequence items have consistent indentation (ZVH3)
1404    fn check_sequence_indentation(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
1405        use crate::SyntaxKind;
1406
1407        // Only check SEQUENCE nodes
1408        if node.kind() != SyntaxKind::SEQUENCE {
1409            return;
1410        }
1411
1412        // Get the root text for offset calculations
1413        let root = find_root(node);
1414        let full_text = root.text().to_string();
1415        let mut dash_columns: Vec<usize> = Vec::new();
1416
1417        // Recursively collect all DASH tokens in this sequence and nested sequences
1418        fn collect_dashes(
1419            node: &rowan::SyntaxNode<crate::Lang>,
1420            dashes: &mut Vec<rowan::SyntaxToken<crate::Lang>>,
1421        ) {
1422            for child in node.children_with_tokens() {
1423                match child {
1424                    rowan::NodeOrToken::Token(token) if token.kind() == crate::SyntaxKind::DASH => {
1425                        dashes.push(token);
1426                    }
1427                    rowan::NodeOrToken::Node(n)
1428                        if n.kind() == crate::SyntaxKind::SEQUENCE_ENTRY =>
1429                    {
1430                        // Collect dashes from sequence entries
1431                        collect_dashes(&n, dashes);
1432                    }
1433                    _ => {}
1434                }
1435            }
1436        }
1437
1438        let mut dashes = Vec::new();
1439        collect_dashes(node, &mut dashes);
1440
1441        for token in dashes {
1442            let offset: usize = token.text_range().start().into();
1443            let col = self.get_column(&full_text, offset);
1444            dash_columns.push(col);
1445        }
1446
1447        // Check if all dashes are at the same column (consistent indentation)
1448        if let Some(&first_col) = dash_columns.first() {
1449            for &col in &dash_columns[1..] {
1450                if col != first_col {
1451                    violations.push(Violation {
1452                        message: "Inconsistent sequence item indentation".to_string(),
1453                        location: None,
1454                        text_range: None,
1455                        severity: Severity::Error,
1456                        rule: Rule::InvalidIndentation,
1457                    });
1458                    return; // Only report once
1459                }
1460            }
1461        }
1462    }
1463
1464    /// Check multiline quoted strings have proper indentation (QB6E)
1465    fn check_quoted_string_indentation(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
1466        use crate::SyntaxKind;
1467
1468        // Only check SCALAR nodes
1469        if node.kind() != SyntaxKind::SCALAR {
1470            return;
1471        }
1472
1473        // Check if this is a quoted string that spans multiple lines
1474        let text = node.text().to_string();
1475        if !text.starts_with('"') && !text.starts_with('\'') {
1476            return; // Not a quoted string
1477        }
1478
1479        if !text.contains('\n') {
1480            return; // Single line, no indentation to check
1481        }
1482
1483        // For multiline quoted strings, continuation lines should be indented
1484
1485        // Check each line after the first
1486        let lines: Vec<&str> = text.split('\n').collect();
1487        if lines.len() > 1 {
1488            // Continuation lines (between opening and closing quote) should have consistent indentation
1489            // In YAML, they should be indented at least as much as the opening line
1490            for (i, line) in lines.iter().enumerate().skip(1) {
1491                if i == lines.len() - 1 && line.trim().is_empty() {
1492                    // Last line might just be the closing quote
1493                    continue;
1494                }
1495
1496                // Count leading spaces
1497                let leading_spaces = line.len() - line.trim_start().len();
1498
1499                // Continuation lines starting at column 0 are invalid
1500                // (they should be indented at least to align with content)
1501                if leading_spaces == 0 && !line.trim().is_empty() {
1502                    violations.push(Violation {
1503                        message: "Wrong indented multiline quoted scalar".to_string(),
1504                        location: None,
1505                        text_range: None,
1506                        severity: Severity::Error,
1507                        rule: Rule::InvalidIndentation,
1508                    });
1509                    return;
1510                }
1511            }
1512        }
1513    }
1514
1515    /// Check for duplicate keys within a mapping node
1516    ///
1517    /// Uses semantic comparison via `yaml_eq()`:
1518    /// - `true` and `True` are duplicates (same boolean value)
1519    /// - `1` and `0x1` are duplicates (same integer value)
1520    /// - `"1"` and `1` are NOT duplicates (different types: string vs int)
1521    /// - `null`, `~`, and empty key are all duplicates (all null)
1522    /// - Works with complex keys (sequences, mappings) as well
1523    fn check_duplicate_keys(&self, node: &SyntaxNode, violations: &mut Vec<Violation>) {
1524        use crate::yaml_eq;
1525        use crate::SyntaxKind;
1526
1527        // Collect all KEY nodes with their text representation and parent entry range
1528        let keys: Vec<(SyntaxNode, String, rowan::TextRange)> = node
1529            .children()
1530            .filter(|child| child.kind() == SyntaxKind::MAPPING_ENTRY)
1531            .filter_map(|child| {
1532                let entry_range = child.text_range();
1533                child
1534                    .children()
1535                    .find(|n| n.kind() == SyntaxKind::KEY)
1536                    .map(|key_node| {
1537                        // Only allocate once: trim() returns &str, then to_string() once
1538                        let key_text = key_node.text().to_string();
1539                        let key_text = key_text.trim().to_string();
1540                        (key_node, key_text, entry_range)
1541                    })
1542            })
1543            .collect();
1544
1545        // Check for semantic duplicates using yaml_eq
1546        // O(n²) is acceptable for typical YAML mapping sizes (usually < 100 keys)
1547        for i in 0..keys.len() {
1548            for j in (i + 1)..keys.len() {
1549                // Get the actual value nodes within each KEY and try to cast to AsYaml types
1550                let key1_child = keys[i].0.children().next();
1551                let key2_child = keys[j].0.children().next();
1552
1553                if let (Some(v1), Some(v2)) = (key1_child, key2_child) {
1554                    // Try each possible node type that implements AsYaml
1555                    use crate::nodes::{Mapping, Scalar, Sequence};
1556
1557                    let are_equal = match (v1.kind(), v2.kind()) {
1558                        (SyntaxKind::SCALAR, SyntaxKind::SCALAR) => Scalar::cast(v1)
1559                            .zip(Scalar::cast(v2))
1560                            .is_some_and(|(s1, s2)| yaml_eq(&s1, &s2)),
1561                        (SyntaxKind::SEQUENCE, SyntaxKind::SEQUENCE) => Sequence::cast(v1)
1562                            .zip(Sequence::cast(v2))
1563                            .is_some_and(|(s1, s2)| yaml_eq(&s1, &s2)),
1564                        (SyntaxKind::MAPPING, SyntaxKind::MAPPING) => Mapping::cast(v1)
1565                            .zip(Mapping::cast(v2))
1566                            .is_some_and(|(m1, m2)| yaml_eq(&m1, &m2)),
1567                        _ => false, // Different types can't be equal
1568                    };
1569
1570                    if are_equal {
1571                        let first_text = &keys[i].1;
1572                        let dup_text = &keys[j].1;
1573
1574                        // Format the key text for display (quote empty strings)
1575                        let format_key = |s: &str| {
1576                            if s.is_empty() {
1577                                "\"\"".to_string()
1578                            } else {
1579                                format!("{:?}", s)
1580                            }
1581                        };
1582
1583                        violations.push(Violation {
1584                            message: format!(
1585                                "Duplicate key: {} (semantically equal to {})",
1586                                format_key(dup_text),
1587                                format_key(first_text)
1588                            ),
1589                            location: None,
1590                            text_range: Some(range_to_text_position(keys[j].2)),
1591                            severity: Severity::Error,
1592                            rule: Rule::DuplicateKeys,
1593                        });
1594                        // Only report each duplicate once
1595                        break;
1596                    }
1597                }
1598            }
1599        }
1600    }
1601}
1602
1603impl Default for Validator {
1604    fn default() -> Self {
1605        Self::new()
1606    }
1607}
1608
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    /// Parses `yaml` into a `Document`, panicking if parsing fails.
    fn parse_document(yaml: &str) -> Document {
        Document::from_str(yaml).unwrap()
    }

    /// Parses `yaml` as a `Document` and validates it with a default `Validator`.
    fn validate_document(yaml: &str) -> Vec<Violation> {
        Validator::new().validate(&parse_document(yaml))
    }

    /// Keeps only the violations reported under `Rule::DuplicateKeys`.
    fn duplicate_key_violations(violations: &[Violation]) -> Vec<&Violation> {
        violations
            .iter()
            .filter(|violation| violation.rule == Rule::DuplicateKeys)
            .collect()
    }

    /// Prints each violation on its own line, indented by two spaces.
    fn print_violations(violations: &[Violation]) {
        for violation in violations {
            println!("  {}", violation);
        }
    }

    #[test]
    fn test_validator_basic() {
        // A trivially valid mapping must not produce any violation
        let violations = validate_document("key: value");

        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_validator_tabs_debug() {
        let doc = parse_document("---\na:\n\tb:\n\t\tc: value");

        // Look for tab characters among the tokens of the syntax tree
        let tab_tokens = doc
            .syntax()
            .descendants_with_tokens()
            .filter_map(|element| match element {
                rowan::NodeOrToken::Token(token) => Some(token),
                rowan::NodeOrToken::Node(_) => None,
            })
            .filter(|token| token.text().contains('\t'));

        let mut found_tab = false;
        for token in tab_tokens {
            println!(
                "Found tab in token: {:?} = {:?}",
                token.kind(),
                token.text()
            );
            found_tab = true;
        }

        println!("Found tab in tree: {}", found_tab);

        // The validator output is only printed here, not asserted on
        let violations = Validator::new().validate(&doc);
        println!("Violations: {}", violations.len());
        print_violations(&violations);

        assert!(found_tab, "Tabs should be in the syntax tree");
    }

    #[test]
    fn test_validator_missing_comma() {
        // Flow mapping with no comma between its two entries
        let violations = validate_document("{foo: 1 bar: 2}");

        println!("Found {} violations:", violations.len());
        print_violations(&violations);

        assert!(
            !violations.is_empty(),
            "Expected violations for missing comma, got none"
        );
    }

    #[test]
    fn test_validator_invalid_escape() {
        // `\.` inside a double-quoted scalar
        let violations = validate_document("\"\\.\"\n");

        assert!(
            !violations.is_empty(),
            "Expected violations for invalid escape \\., got none"
        );
        assert_eq!(violations[0].rule, Rule::InvalidEscape);
    }

    #[test]
    fn test_validator_multiple_anchors() {
        let validator = Validator::new();

        // Simple case: two anchors in front of one key
        let simple = validator.validate(&parse_document("&a &b key: value"));
        assert!(
            !simple.is_empty(),
            "Expected violations for multiple anchors, got none"
        );
        assert_eq!(simple[0].rule, Rule::InvalidAnchor);

        // 4JVG case: at least two violations are expected for this input
        let nested_doc =
            parse_document("top1: &node1\n  &k1 key1: val1\ntop2: &node2\n  &v2 val2\n");
        let nested = validator.validate(&nested_doc);
        assert!(
            nested.len() >= 2,
            "Expected at least 2 violations for 4JVG"
        );
    }

    #[test]
    fn test_validator_duplicate_directive() {
        let violations = validate_document("%YAML 1.2\n%YAML 1.2\n---\nkey: value\n");

        assert_eq!(
            violations.len(),
            1,
            "Expected exactly one violation for duplicate YAML directive"
        );
        assert_eq!(violations[0].message, "Duplicate %YAML directive");
    }

    #[test]
    fn test_validator_duplicate_keys() {
        let violations = validate_document("a: 1\nb: 2\na: 3\n");
        let duplicates = duplicate_key_violations(&violations);

        assert_eq!(
            duplicates.len(),
            1,
            "Expected exactly one DuplicateKeys violation, got: {:?}",
            duplicates
        );
        assert_eq!(
            duplicates[0].message,
            "Duplicate key: \"a\" (semantically equal to \"a\")"
        );
    }

    #[test]
    fn test_validator_no_duplicate_keys() {
        let violations = validate_document("a: 1\nb: 2\nc: 3\n");

        assert_eq!(
            duplicate_key_violations(&violations).len(),
            0,
            "Expected no DuplicateKeys violations"
        );
    }

    #[test]
    fn test_validator_duplicate_keys_disabled() {
        // Same input as the duplicate-key test, but with the check switched off
        let validator = Validator::with_config(ValidatorConfig {
            check_duplicate_keys: false,
            ..ValidatorConfig::default()
        });
        let violations = validator.validate(&parse_document("a: 1\nb: 2\na: 3\n"));

        assert_eq!(
            duplicate_key_violations(&violations).len(),
            0,
            "Expected no violations when duplicate key check is disabled"
        );
    }

    #[test]
    fn test_validator_semantic_duplicate_keys() {
        // (input, expected number of DuplicateKeys violations, failure message)
        let cases: [(&str, usize, &str); 6] = [
            // Different quote styles
            (
                "'a': 1\na: 2",
                1,
                "Quoted 'a' and unquoted a should be duplicates",
            ),
            // Different boolean spellings
            ("true: 1\nTrue: 2", 1, "true and True should be duplicates"),
            // Different integer spellings
            ("1: one\n0x1: hex", 1, "1 and 0x1 should be duplicates"),
            // Different null spellings
            ("null: 1\n~: 2", 1, "null and ~ should be duplicates"),
            // String vs int: different types
            (
                "\"1\": string\n1: int",
                0,
                "String '1' and int 1 should not be duplicates",
            ),
            // Float vs int: different types
            (
                "1.0: float\n1: int",
                0,
                "Float 1.0 and int 1 should not be duplicates",
            ),
        ];

        let validator = Validator::new();
        for (yaml, expected, reason) in cases {
            let violations = validator.validate(&parse_document(yaml));
            let found = duplicate_key_violations(&violations).len();
            assert_eq!(found, expected, "{}", reason);
        }
    }

    #[test]
    fn test_validator_directive_without_document() {
        use crate::SyntaxKind;

        // 9MMA: a directive that is not followed by any document
        let doc = parse_document("%YAML 1.2\n");

        // Count directives from the document's parent when there is one,
        // otherwise from the document node itself
        let document_node = doc.syntax();
        let root = match document_node.parent() {
            Some(parent) => parent,
            None => document_node.clone(),
        };
        let directive_count = root
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DIRECTIVE)
            .count();

        let is_content = |kind: SyntaxKind| {
            matches!(
                kind,
                SyntaxKind::MAPPING
                    | SyntaxKind::SEQUENCE
                    | SyntaxKind::SCALAR
                    | SyntaxKind::TAGGED_NODE
            )
        };
        let content_count = document_node
            .descendants()
            .filter(|n| is_content(n.kind()))
            .count();

        let violations = Validator::new().validate(&doc);

        // The assertion only applies when the tree really holds a directive
        // and no content nodes
        if directive_count > 0 && content_count == 0 {
            assert!(
                !violations.is_empty(),
                "Expected violation for directive without document (directives={}, content={})",
                directive_count,
                content_count
            );
        }
    }

    #[test]
    fn test_validator_content_after_doc_end() {
        // 3HFZ: content following the `...` document end marker
        let violations = validate_document("---\nkey: value\n... invalid\n");

        let invalid_content = violations
            .iter()
            .filter(|v| v.message.starts_with("Invalid content in document:"))
            .count();
        assert_eq!(
            invalid_content,
            1,
            "Expected exactly one 'Invalid content' violation for content after document end marker"
        );
    }

    #[test]
    fn test_validator_directive_with_tagged_node_content() {
        // A tagged scalar after a directive counts as document content, so
        // this input must validate cleanly
        let violations = validate_document("%YAML 1.2\n---\n!custom foo\n");

        assert_eq!(
            violations.len(),
            0,
            "Tagged scalar is real content; valid document should have no violations"
        );
    }

    #[test]
    fn test_validator_with_config() {
        let validator = Validator::with_config(ValidatorConfig {
            check_duplicate_keys: false,
            ..Default::default()
        });

        let violations = validator.validate(&parse_document("key: value"));

        assert_eq!(violations.len(), 0);
    }

    #[test]
    fn test_violation_display() {
        let violation = Violation {
            message: "Test violation".to_string(),
            location: Some("1:5".to_string()),
            text_range: None,
            severity: Severity::Error,
            rule: Rule::InvalidIndentation,
        };

        let rendered = format!("{}", violation);

        assert_eq!(
            rendered,
            "[ERROR] 1:5: Test violation (InvalidIndentation)"
        );
    }

    #[test]
    fn test_u99r_invalid_comma_in_tag() {
        use crate::YamlFile;

        // U99R: a comma directly after a tag
        let file = YamlFile::from_str("- !!str, xxx\n").unwrap();

        // Dump the tree to help debugging failures
        println!("\n=== Syntax tree ===");
        crate::debug::print_tree(file.syntax());

        let violations = Validator::new().validate_syntax(file.syntax());
        println!("\n=== Violations ({}) ===", violations.len());
        print_violations(&violations);

        assert!(
            !violations.is_empty(),
            "Expected violation for invalid comma after tag"
        );
        assert_eq!(violations.len(), 1);
        assert_eq!(violations[0].message, "Invalid comma after tag");
        assert_eq!(violations[0].rule, Rule::InvalidTag);
    }

    #[test]
    fn test_comment_whitespace() {
        use crate::YamlFile;

        // `#` immediately after the closing quote, with no separating whitespace
        let parsed = YamlFile::from_str("key: \"value\"# invalid comment\n").expect("Should parse");

        let violations = Validator::new().validate_syntax(parsed.syntax());

        assert!(
            !violations.is_empty(),
            "Should catch comment without whitespace"
        );
        assert_eq!(
            violations[0].message,
            "Comment without whitespace separation"
        );
    }

    #[test]
    fn test_doc_start_content() {
        use crate::YamlFile;

        // Mapping content on the same line as the `---` marker
        let parsed =
            YamlFile::from_str("--- key1: value1\n    key2: value2\n").expect("Should parse");

        let violations = Validator::new().validate_syntax(parsed.syntax());

        assert!(
            !violations.is_empty(),
            "Should catch content on doc start line"
        );
        assert_eq!(
            violations[0].message,
            "Content on same line as document start marker"
        );
    }

    #[test]
    fn test_directive_in_document_content() {
        use rowan::ast::AstNode;

        // A second %YAML directive after `---` with no `...` in between
        let file = crate::YamlFile::from_str("%YAML 1.2\n---\n%YAML 1.2\n---\n").unwrap();

        let violations = Validator::new().validate_syntax(file.syntax());

        assert_eq!(
            violations.len(),
            1,
            "Expected one violation for directive in content, got: {:?}",
            violations
        );
        assert_eq!(
            violations[0].message,
            "Directive in document content (missing document end marker `...` before directive)"
        );
    }
}