Skip to main content

ralph_workflow/files/llm_output_extraction/
xsd_validation_issues.rs

1//! XSD validation for issues XML format.
2//!
3//! This module provides validation of XML output against the XSD schema
4//! to ensure AI agent output conforms to the expected format for review issues.
5//!
6//! Uses quick_xml for robust XML parsing with proper whitespace handling.
7
8use crate::files::llm_output_extraction::xml_helpers::{
9    create_reader, duplicate_element_error, format_content_preview, malformed_xml_error,
10    read_text_until_end, skip_to_end, text_outside_tags_error, unexpected_element_error,
11};
12use crate::files::llm_output_extraction::xsd_validation::{XsdErrorType, XsdValidationError};
13use quick_xml::events::Event;
14
15/// Example of valid issues XML with issues.
16const EXAMPLE_ISSUES_XML: &str = r#"<ralph-issues>
17<ralph-issue>Missing error handling in API endpoint</ralph-issue>
18<ralph-issue>Variable shadowing in loop construct</ralph-issue>
19</ralph-issues>"#;
20
21/// Example of valid issues XML with no issues.
22const EXAMPLE_NO_ISSUES_XML: &str = r#"<ralph-issues>
23<ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>
24</ralph-issues>"#;
25
26/// Validate issues XML content against the issues XSD.
27///
28/// Accepts either `<ralph-issues><ralph-issue>...` items or a single
29/// `<ralph-no-issues-found>` entry.
30pub fn validate_issues_xml(xml_content: &str) -> Result<IssuesElements, XsdValidationError> {
31    let content = xml_content.trim();
32
33    // Check for illegal XML characters BEFORE parsing
34    // This provides clear error messages instead of cryptic parse errors
35    use crate::files::llm_output_extraction::xml_helpers::check_for_illegal_xml_characters;
36    check_for_illegal_xml_characters(content)?;
37
38    let mut reader = create_reader(content);
39    let mut buf = Vec::new();
40
41    // Find the root element
42    loop {
43        match reader.read_event_into(&mut buf) {
44            Ok(Event::Start(e)) if e.name().as_ref() == b"ralph-issues" => break,
45            Ok(Event::Start(e)) => {
46                let name_bytes = e.name();
47                let tag_name = String::from_utf8_lossy(name_bytes.as_ref());
48                return Err(XsdValidationError {
49                    error_type: XsdErrorType::MissingRequiredElement,
50                    element_path: "ralph-issues".to_string(),
51                    expected: "<ralph-issues> as root element".to_string(),
52                    found: format!("<{}> (wrong root element)", tag_name),
53                    suggestion: "Use <ralph-issues> as the root element.".to_string(),
54                    example: Some(EXAMPLE_ISSUES_XML.into()),
55                });
56            }
57            Ok(Event::Text(_)) => {
58                // Text before root element - continue to EOF error which is more informative
59            }
60            Ok(Event::Eof) => {
61                return Err(XsdValidationError {
62                    error_type: XsdErrorType::MissingRequiredElement,
63                    element_path: "ralph-issues".to_string(),
64                    expected: "<ralph-issues> as root element".to_string(),
65                    found: format_content_preview(content),
66                    suggestion: "Wrap your issues in <ralph-issues>...</ralph-issues> tags."
67                        .to_string(),
68                    example: Some(EXAMPLE_ISSUES_XML.into()),
69                });
70            }
71            Ok(_) => {} // Skip XML declaration, comments, etc.
72            Err(e) => return Err(malformed_xml_error(e)),
73        }
74        buf.clear();
75    }
76
77    // Parse child elements
78    let mut issues: Vec<String> = Vec::new();
79    let mut no_issues_found: Option<String> = None;
80
81    const VALID_TAGS: [&str; 2] = ["ralph-issue", "ralph-no-issues-found"];
82
83    loop {
84        buf.clear();
85        match reader.read_event_into(&mut buf) {
86            Ok(Event::Start(e)) => {
87                match e.name().as_ref() {
88                    b"ralph-issue" => {
89                        // Cannot mix issues and no-issues-found
90                        if no_issues_found.is_some() {
91                            return Err(XsdValidationError {
92                                error_type: XsdErrorType::UnexpectedElement,
93                                element_path: "ralph-issues/ralph-issue".to_string(),
94                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
95                                found: "mixed issues and no-issues-found".to_string(),
96                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
97                                example: Some(EXAMPLE_ISSUES_XML.into()),
98                            });
99                        }
100                        let issue_text = read_text_until_end(&mut reader, b"ralph-issue")?;
101                        issues.push(issue_text);
102                    }
103                    b"ralph-no-issues-found" => {
104                        // Cannot mix issues and no-issues-found
105                        if !issues.is_empty() {
106                            return Err(XsdValidationError {
107                                error_type: XsdErrorType::UnexpectedElement,
108                                element_path: "ralph-issues/ralph-no-issues-found".to_string(),
109                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
110                                found: "mixed issues and no-issues-found".to_string(),
111                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
112                                example: Some(EXAMPLE_NO_ISSUES_XML.into()),
113                            });
114                        }
115                        if no_issues_found.is_some() {
116                            return Err(duplicate_element_error(
117                                "ralph-no-issues-found",
118                                "ralph-issues",
119                            ));
120                        }
121                        no_issues_found =
122                            Some(read_text_until_end(&mut reader, b"ralph-no-issues-found")?);
123                    }
124                    other => {
125                        let _ = skip_to_end(&mut reader, other);
126                        return Err(unexpected_element_error(other, &VALID_TAGS, "ralph-issues"));
127                    }
128                }
129            }
130            Ok(Event::Text(e)) => {
131                let text = e.unescape().unwrap_or_default();
132                let trimmed = text.trim();
133                if !trimmed.is_empty() {
134                    return Err(text_outside_tags_error(trimmed, "ralph-issues"));
135                }
136            }
137            Ok(Event::End(e)) if e.name().as_ref() == b"ralph-issues" => break,
138            Ok(Event::Eof) => {
139                return Err(XsdValidationError {
140                    error_type: XsdErrorType::MalformedXml,
141                    element_path: "ralph-issues".to_string(),
142                    expected: "closing </ralph-issues> tag".to_string(),
143                    found: "end of content without closing tag".to_string(),
144                    suggestion: "Add </ralph-issues> at the end.".to_string(),
145                    example: Some(EXAMPLE_ISSUES_XML.into()),
146                });
147            }
148            Ok(_) => {} // Skip comments, etc.
149            Err(e) => return Err(malformed_xml_error(e)),
150        }
151    }
152
153    // Filter out empty issues
154    let filtered_issues: Vec<String> = issues.into_iter().filter(|s| !s.is_empty()).collect();
155    let filtered_no_issues = no_issues_found.filter(|s| !s.is_empty());
156
157    // Must have either issues or no-issues-found
158    if filtered_issues.is_empty() && filtered_no_issues.is_none() {
159        return Err(XsdValidationError {
160            error_type: XsdErrorType::MissingRequiredElement,
161            element_path: "ralph-issues".to_string(),
162            expected: "at least one <ralph-issue> element OR <ralph-no-issues-found>".to_string(),
163            found: "empty <ralph-issues> element".to_string(),
164            suggestion:
165                "Add <ralph-issue> elements for issues found, or <ralph-no-issues-found> if no issues exist."
166                    .to_string(),
167            example: Some(EXAMPLE_ISSUES_XML.into()),
168        });
169    }
170
171    Ok(IssuesElements {
172        issues: filtered_issues,
173        no_issues_found: filtered_no_issues,
174    })
175}
176
177/// Parsed issues elements from valid XML.
178#[derive(Debug, Clone, PartialEq, Eq)]
179pub struct IssuesElements {
180    /// List of issues (if any)
181    pub issues: Vec<String>,
182    /// No issues found message (if no issues)
183    pub no_issues_found: Option<String>,
184}
185
186impl IssuesElements {
187    /// Returns true if there are no issues.
188    #[cfg(any(test, feature = "test-utils"))]
189    pub fn is_empty(&self) -> bool {
190        self.issues.is_empty() && self.no_issues_found.is_some()
191    }
192
193    /// Returns the number of issues.
194    #[cfg(any(test, feature = "test-utils"))]
195    pub fn issue_count(&self) -> usize {
196        self.issues.len()
197    }
198}
199
200#[cfg(test)]
201mod tests {
202    use super::*;
203
204    #[test]
205    fn test_validate_valid_single_issue() {
206        let xml = r#"<ralph-issues>
207<ralph-issue>First issue description</ralph-issue>
208</ralph-issues>"#;
209
210        let result = validate_issues_xml(xml);
211        assert!(result.is_ok());
212        let elements = result.unwrap();
213        assert_eq!(elements.issues.len(), 1);
214        assert_eq!(elements.issues[0], "First issue description");
215        assert!(elements.no_issues_found.is_none());
216    }
217
218    #[test]
219    fn test_validate_valid_multiple_issues() {
220        let xml = r#"<ralph-issues>
221<ralph-issue>First issue</ralph-issue>
222<ralph-issue>Second issue</ralph-issue>
223<ralph-issue>Third issue</ralph-issue>
224</ralph-issues>"#;
225
226        let result = validate_issues_xml(xml);
227        assert!(result.is_ok());
228        let elements = result.unwrap();
229        assert_eq!(elements.issues.len(), 3);
230        assert_eq!(elements.issue_count(), 3);
231    }
232
233    #[test]
234    fn test_validate_valid_no_issues_found() {
235        let xml = r#"<ralph-issues><ralph-no-issues-found>No issues were found during review</ralph-no-issues-found></ralph-issues>"#;
236
237        let result = validate_issues_xml(xml);
238        assert!(result.is_ok());
239        let elements = result.unwrap();
240        assert!(elements.issues.is_empty());
241        assert!(elements.no_issues_found.is_some());
242        assert!(elements.is_empty());
243    }
244
245    #[test]
246    fn test_validate_missing_root_element() {
247        let xml = r#"Some random text without proper XML tags"#;
248
249        let result = validate_issues_xml(xml);
250        assert!(result.is_err());
251        let error = result.unwrap_err();
252        assert_eq!(error.element_path, "ralph-issues");
253    }
254
255    #[test]
256    fn test_validate_empty_issues() {
257        let xml = r#"<ralph-issues></ralph-issues>"#;
258
259        let result = validate_issues_xml(xml);
260        assert!(result.is_err());
261        let error = result.unwrap_err();
262        assert!(error.expected.contains("at least one"));
263    }
264
265    #[test]
266    fn test_validate_mixed_issues_and_no_issues_found() {
267        let xml = r#"<ralph-issues><ralph-issue>First issue</ralph-issue><ralph-no-issues-found>No issues</ralph-no-issues-found></ralph-issues>"#;
268
269        let result = validate_issues_xml(xml);
270        assert!(result.is_err());
271        let error = result.unwrap_err();
272        assert!(error.suggestion.contains("not both") || error.expected.contains("not both"));
273    }
274
275    #[test]
276    fn test_validate_duplicate_no_issues_found() {
277        let xml = r#"<ralph-issues><ralph-no-issues-found>No issues</ralph-no-issues-found><ralph-no-issues-found>Also no issues</ralph-no-issues-found></ralph-issues>"#;
278
279        let result = validate_issues_xml(xml);
280        assert!(result.is_err());
281    }
282
283    #[test]
284    fn test_validate_whitespace_handling() {
285        // This is the key test - quick_xml should handle whitespace between elements
286        let xml =
287            "  <ralph-issues>  \n  <ralph-issue>Issue text</ralph-issue>  \n  </ralph-issues>  ";
288
289        let result = validate_issues_xml(xml);
290        assert!(result.is_ok());
291    }
292
293    #[test]
294    fn test_validate_with_xml_declaration() {
295        let xml = r#"<?xml version="1.0"?><ralph-issues><ralph-issue>Issue text</ralph-issue></ralph-issues>"#;
296
297        let result = validate_issues_xml(xml);
298        assert!(result.is_ok());
299    }
300
301    #[test]
302    fn test_validate_issue_with_code_element() {
303        // XSD now allows <code> elements for escaping special characters
304        let xml = r#"<ralph-issues><ralph-issue>Check if <code>a &lt; b</code> is valid</ralph-issue></ralph-issues>"#;
305
306        let result = validate_issues_xml(xml);
307        assert!(result.is_ok());
308        let elements = result.unwrap();
309        assert_eq!(elements.issues.len(), 1);
310        // The text from both outside and inside <code> should be collected
311        assert!(elements.issues[0].contains("Check if"));
312        assert!(elements.issues[0].contains("a < b"));
313        assert!(elements.issues[0].contains("is valid"));
314    }
315
316    #[test]
317    fn test_validate_no_issues_with_code_element() {
318        let xml = r#"<ralph-issues><ralph-no-issues-found>All <code>Record&lt;string, T&gt;</code> types are correct</ralph-no-issues-found></ralph-issues>"#;
319
320        let result = validate_issues_xml(xml);
321        assert!(result.is_ok());
322        let elements = result.unwrap();
323        assert!(elements.no_issues_found.is_some());
324        let msg = elements.no_issues_found.unwrap();
325        assert!(msg.contains("Record<string, T>"));
326    }
327
328    // =========================================================================
329    // REALISTIC LLM OUTPUT TESTS
330    // These test actual patterns that LLMs produce when following the prompts
331    // =========================================================================
332
333    #[test]
334    fn test_llm_realistic_issue_with_generic_type_escaped() {
335        // LLM correctly escapes generic types per prompt instructions
336        let xml = r#"<ralph-issues>
337<ralph-issue>[High] src/parser.rs:42 - The function <code>parse&lt;T&gt;</code> does not handle empty input.
338Suggested fix: Add a check for empty input before parsing.</ralph-issue>
339</ralph-issues>"#;
340
341        let result = validate_issues_xml(xml);
342        assert!(result.is_ok(), "Should parse escaped generic: {:?}", result);
343        let elements = result.unwrap();
344        assert!(elements.issues[0].contains("parse<T>"));
345    }
346
347    #[test]
348    fn test_llm_realistic_issue_with_comparison_escaped() {
349        // LLM correctly escapes comparison operators
350        let xml = r#"<ralph-issues>
351<ralph-issue>[Medium] src/validate.rs:15 - The condition <code>count &lt; 0</code> should be <code>count &lt;= 0</code>.
352Suggested fix: Change the comparison operator.</ralph-issue>
353</ralph-issues>"#;
354
355        let result = validate_issues_xml(xml);
356        assert!(
357            result.is_ok(),
358            "Should parse escaped comparisons: {:?}",
359            result
360        );
361        let elements = result.unwrap();
362        assert!(elements.issues[0].contains("count < 0"));
363        assert!(elements.issues[0].contains("count <= 0"));
364    }
365
366    #[test]
367    fn test_llm_realistic_issue_with_logical_operators_escaped() {
368        // LLM escapes && and || operators
369        let xml = r#"<ralph-issues><ralph-issue>[Low] src/filter.rs:88 - The expression <code>a &amp;&amp; b || c</code> has ambiguous precedence.
370Suggested fix: Add explicit parentheses.</ralph-issue></ralph-issues>"#;
371
372        let result = validate_issues_xml(xml);
373        assert!(
374            result.is_ok(),
375            "Should parse escaped logical operators: {:?}",
376            result
377        );
378        let elements = result.unwrap();
379        assert!(elements.issues[0].contains("a && b || c"));
380    }
381
382    #[test]
383    fn test_llm_realistic_issue_with_rust_lifetime() {
384        // LLM references Rust lifetime syntax
385        let xml = r#"<ralph-issues><ralph-issue>[High] src/buffer.rs:23 - The lifetime <code>&amp;'a str</code> should match the struct lifetime.
386Suggested fix: Ensure lifetime annotations are consistent.</ralph-issue></ralph-issues>"#;
387
388        let result = validate_issues_xml(xml);
389        assert!(result.is_ok(), "Should parse lifetime syntax: {:?}", result);
390        let elements = result.unwrap();
391        assert!(elements.issues[0].contains("&'a str"));
392    }
393
394    #[test]
395    fn test_llm_realistic_issue_with_html_in_description() {
396        // LLM describes HTML-related code
397        let xml = r#"<ralph-issues><ralph-issue>[Medium] src/template.rs:56 - The HTML template uses <code>&lt;div class="container"&gt;</code> but should use semantic tags.
398Suggested fix: Replace with appropriate semantic HTML elements.</ralph-issue></ralph-issues>"#;
399
400        let result = validate_issues_xml(xml);
401        assert!(result.is_ok(), "Should parse HTML in code: {:?}", result);
402        let elements = result.unwrap();
403        assert!(elements.issues[0].contains("<div class=\"container\">"));
404    }
405
406    #[test]
407    fn test_llm_realistic_no_issues_with_detailed_explanation() {
408        // LLM provides detailed explanation when no issues found
409        let xml = "<ralph-issues><ralph-no-issues-found>The implementation correctly handles all edge cases:\n- Input validation properly rejects values where <code>x &lt; 0</code>\n- The generic <code>Result&lt;T, E&gt;</code> type is used consistently\n- Error handling follows the project's established patterns\nNo issues require attention.</ralph-no-issues-found></ralph-issues>";
410
411        let result = validate_issues_xml(xml);
412        assert!(
413            result.is_ok(),
414            "Should parse detailed no-issues: {:?}",
415            result
416        );
417        let elements = result.unwrap();
418        let msg = elements.no_issues_found.unwrap();
419        assert!(msg.contains("x < 0"));
420        assert!(msg.contains("Result<T, E>"));
421    }
422
423    #[test]
424    fn test_llm_realistic_multiple_issues_with_mixed_content() {
425        // LLM reports multiple issues with various escaped content
426        let xml = r#"<ralph-issues><ralph-issue>[Critical] src/auth.rs:12 - SQL injection vulnerability: user input in <code>query &amp;&amp; filter</code> is not sanitized.</ralph-issue><ralph-issue>[High] src/api.rs:45 - Missing null check: <code>response.data</code> may be undefined when <code>status &lt; 200</code>.</ralph-issue><ralph-issue>[Medium] src/utils.rs:78 - The type <code>Option&lt;Vec&lt;T&gt;&gt;</code> could be simplified to <code>Vec&lt;T&gt;</code> with empty default.</ralph-issue></ralph-issues>"#;
427
428        let result = validate_issues_xml(xml);
429        assert!(
430            result.is_ok(),
431            "Should parse multiple issues with mixed content: {:?}",
432            result
433        );
434        let elements = result.unwrap();
435        assert_eq!(elements.issues.len(), 3);
436        assert!(elements.issues[0].contains("query && filter"));
437        assert!(elements.issues[1].contains("status < 200"));
438        assert!(elements.issues[2].contains("Option<Vec<T>>"));
439    }
440
441    #[test]
442    fn test_llm_mistake_unescaped_less_than_fails() {
443        // LLM forgets to escape < - this SHOULD fail
444        let xml = r#"<ralph-issues><ralph-issue>[High] src/compare.rs:10 - The condition a < b is wrong.</ralph-issue></ralph-issues>"#;
445
446        let result = validate_issues_xml(xml);
447        assert!(
448            result.is_err(),
449            "Unescaped < should fail XML parsing: {:?}",
450            result
451        );
452    }
453
454    #[test]
455    fn test_llm_mistake_unescaped_generic_fails() {
456        // LLM forgets to escape generic type - this SHOULD fail
457        let xml = r#"<ralph-issues><ralph-issue>[High] src/types.rs:5 - The type Vec<String> is incorrect.</ralph-issue></ralph-issues>"#;
458
459        let result = validate_issues_xml(xml);
460        assert!(
461            result.is_err(),
462            "Unescaped generic should fail XML parsing: {:?}",
463            result
464        );
465    }
466
467    #[test]
468    fn test_llm_mistake_unescaped_ampersand_fails() {
469        // LLM forgets to escape & - this SHOULD fail
470        let xml = r#"<ralph-issues><ralph-issue>[High] src/logic.rs:20 - The expression a && b is wrong.</ralph-issue></ralph-issues>"#;
471
472        let result = validate_issues_xml(xml);
473        assert!(
474            result.is_err(),
475            "Unescaped && should fail XML parsing: {:?}",
476            result
477        );
478    }
479
480    #[test]
481    fn test_llm_uses_cdata_for_code_content() {
482        // LLM uses CDATA instead of escaping (valid alternative)
483        let xml = r#"<ralph-issues><ralph-issue>[High] src/cmp.rs:10 - The condition <code><![CDATA[a < b && c > d]]></code> has issues.</ralph-issue></ralph-issues>"#;
484
485        let result = validate_issues_xml(xml);
486        assert!(result.is_ok(), "CDATA should be valid: {:?}", result);
487        let elements = result.unwrap();
488        assert!(elements.issues[0].contains("a < b && c > d"));
489    }
490
491    // =========================================================================
492    // REGRESSION TEST FOR BUG: NUL byte from NBSP typo
493    // =========================================================================
494
495    #[test]
496    fn test_validate_nul_byte_from_nbsp_typo() {
497        // Regression test for bug where agent writes \u0000 instead of \u00A0
498        // This simulates: .replace("git diff", "git\0A0diff")
499        // The bug report shows this exact pattern in `.agent/tmp/issues.xml.processed`
500        let xml =
501            "<ralph-issues><ralph-issue>Check git\u{0000}A0diff usage</ralph-issue></ralph-issues>";
502
503        let result = validate_issues_xml(xml);
504        assert!(result.is_err(), "NUL byte should be rejected");
505
506        let error = result.unwrap_err();
507        assert!(
508            error.found.contains("NUL") || error.found.contains("0x00"),
509            "Error should identify NUL byte, got: {}",
510            error.found
511        );
512        assert!(
513            error.suggestion.contains("\\u00A0") || error.suggestion.contains("non-breaking space"),
514            "Error should suggest NBSP as common fix, got: {}",
515            error.suggestion
516        );
517    }
518}