Skip to main content

ralph_workflow/files/llm_output_extraction/
xsd_validation_issues.rs

1//! XSD validation for issues XML format.
2//!
3//! This module provides validation of XML output against the XSD schema
4//! to ensure AI agent output conforms to the expected format for review issues.
5//!
6//! Uses quick_xml for robust XML parsing with proper whitespace handling.
7
8use crate::files::llm_output_extraction::xml_helpers::{
9    create_reader, duplicate_element_error, malformed_xml_error, read_text_until_end, skip_to_end,
10    text_outside_tags_error, unexpected_element_error,
11};
12use crate::files::llm_output_extraction::xsd_validation::{XsdErrorType, XsdValidationError};
13use quick_xml::events::Event;
14
/// Sample of a well-formed issues document that reports two issues.
///
/// Attached to validation errors as the `example` so the author of the
/// rejected XML can see the expected shape.
const EXAMPLE_ISSUES_XML: &str = concat!(
    "<ralph-issues>\n",
    "<ralph-issue>Missing error handling in API endpoint</ralph-issue>\n",
    "<ralph-issue>Variable shadowing in loop construct</ralph-issue>\n",
    "</ralph-issues>",
);
20
/// Sample of a well-formed issues document that reports no issues.
///
/// Attached to validation errors raised while handling a
/// `<ralph-no-issues-found>` element.
const EXAMPLE_NO_ISSUES_XML: &str = concat!(
    "<ralph-issues>\n",
    "<ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>\n",
    "</ralph-issues>",
);
25
26/// Validate issues XML content against the XSD schema.
27///
28/// This function validates that the XML content conforms to the expected
29/// issues format defined in issues.xsd:
30///
31/// ```xml
32/// <ralph-issues>
33///   <ralph-issue>First issue description</ralph-issue>
34///   <ralph-issue>Second issue description</ralph-issue>
35///   ...
36/// </ralph-issues>
37/// ```
38///
39/// OR for no issues:
40///
41/// ```xml
42/// <ralph-issues>
43///   <ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>
44/// </ralph-issues>
45/// ```
46///
47/// # Arguments
48///
49/// * `xml_content` - The XML content to validate
50///
51/// # Returns
52///
53/// * `Ok(IssuesElements)` if the XML is valid and contains all required elements
54/// * `Err(XsdValidationError)` if the XML is invalid or doesn't conform to the schema
55pub fn validate_issues_xml(xml_content: &str) -> Result<IssuesElements, XsdValidationError> {
56    let content = xml_content.trim();
57    let mut reader = create_reader(content);
58    let mut buf = Vec::new();
59
60    // Find the root element
61    loop {
62        match reader.read_event_into(&mut buf) {
63            Ok(Event::Start(e)) if e.name().as_ref() == b"ralph-issues" => break,
64            Ok(Event::Start(e)) => {
65                let name_bytes = e.name();
66                let tag_name = String::from_utf8_lossy(name_bytes.as_ref());
67                return Err(XsdValidationError {
68                    error_type: XsdErrorType::MissingRequiredElement,
69                    element_path: "ralph-issues".to_string(),
70                    expected: "<ralph-issues> as root element".to_string(),
71                    found: format!("<{}> (wrong root element)", tag_name),
72                    suggestion: "Use <ralph-issues> as the root element.".to_string(),
73                    example: Some(EXAMPLE_ISSUES_XML.into()),
74                });
75            }
76            Ok(Event::Text(_)) => {
77                // Text before root element - continue to EOF error which is more informative
78            }
79            Ok(Event::Eof) => {
80                return Err(XsdValidationError {
81                    error_type: XsdErrorType::MissingRequiredElement,
82                    element_path: "ralph-issues".to_string(),
83                    expected: "<ralph-issues> as root element".to_string(),
84                    found: if content.is_empty() {
85                        "empty content".to_string()
86                    } else if content.len() <= 60 {
87                        content.to_string()
88                    } else {
89                        format!("{}...", &content[..60])
90                    },
91                    suggestion: "Wrap your issues in <ralph-issues>...</ralph-issues> tags."
92                        .to_string(),
93                    example: Some(EXAMPLE_ISSUES_XML.into()),
94                });
95            }
96            Ok(_) => {} // Skip XML declaration, comments, etc.
97            Err(e) => return Err(malformed_xml_error(e)),
98        }
99        buf.clear();
100    }
101
102    // Parse child elements
103    let mut issues: Vec<String> = Vec::new();
104    let mut no_issues_found: Option<String> = None;
105
106    const VALID_TAGS: [&str; 2] = ["ralph-issue", "ralph-no-issues-found"];
107
108    loop {
109        buf.clear();
110        match reader.read_event_into(&mut buf) {
111            Ok(Event::Start(e)) => {
112                match e.name().as_ref() {
113                    b"ralph-issue" => {
114                        // Cannot mix issues and no-issues-found
115                        if no_issues_found.is_some() {
116                            return Err(XsdValidationError {
117                                error_type: XsdErrorType::UnexpectedElement,
118                                element_path: "ralph-issues/ralph-issue".to_string(),
119                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
120                                found: "mixed issues and no-issues-found".to_string(),
121                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
122                                example: Some(EXAMPLE_ISSUES_XML.into()),
123                            });
124                        }
125                        let issue_text = read_text_until_end(&mut reader, b"ralph-issue")?;
126                        issues.push(issue_text);
127                    }
128                    b"ralph-no-issues-found" => {
129                        // Cannot mix issues and no-issues-found
130                        if !issues.is_empty() {
131                            return Err(XsdValidationError {
132                                error_type: XsdErrorType::UnexpectedElement,
133                                element_path: "ralph-issues/ralph-no-issues-found".to_string(),
134                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
135                                found: "mixed issues and no-issues-found".to_string(),
136                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
137                                example: Some(EXAMPLE_NO_ISSUES_XML.into()),
138                            });
139                        }
140                        if no_issues_found.is_some() {
141                            return Err(duplicate_element_error(
142                                "ralph-no-issues-found",
143                                "ralph-issues",
144                            ));
145                        }
146                        no_issues_found =
147                            Some(read_text_until_end(&mut reader, b"ralph-no-issues-found")?);
148                    }
149                    other => {
150                        let _ = skip_to_end(&mut reader, other);
151                        return Err(unexpected_element_error(other, &VALID_TAGS, "ralph-issues"));
152                    }
153                }
154            }
155            Ok(Event::Text(e)) => {
156                let text = e.unescape().unwrap_or_default();
157                let trimmed = text.trim();
158                if !trimmed.is_empty() {
159                    return Err(text_outside_tags_error(trimmed, "ralph-issues"));
160                }
161            }
162            Ok(Event::End(e)) if e.name().as_ref() == b"ralph-issues" => break,
163            Ok(Event::Eof) => {
164                return Err(XsdValidationError {
165                    error_type: XsdErrorType::MalformedXml,
166                    element_path: "ralph-issues".to_string(),
167                    expected: "closing </ralph-issues> tag".to_string(),
168                    found: "end of content without closing tag".to_string(),
169                    suggestion: "Add </ralph-issues> at the end.".to_string(),
170                    example: Some(EXAMPLE_ISSUES_XML.into()),
171                });
172            }
173            Ok(_) => {} // Skip comments, etc.
174            Err(e) => return Err(malformed_xml_error(e)),
175        }
176    }
177
178    // Filter out empty issues
179    let filtered_issues: Vec<String> = issues.into_iter().filter(|s| !s.is_empty()).collect();
180    let filtered_no_issues = no_issues_found.filter(|s| !s.is_empty());
181
182    // Must have either issues or no-issues-found
183    if filtered_issues.is_empty() && filtered_no_issues.is_none() {
184        return Err(XsdValidationError {
185            error_type: XsdErrorType::MissingRequiredElement,
186            element_path: "ralph-issues".to_string(),
187            expected: "at least one <ralph-issue> element OR <ralph-no-issues-found>".to_string(),
188            found: "empty <ralph-issues> element".to_string(),
189            suggestion:
190                "Add <ralph-issue> elements for issues found, or <ralph-no-issues-found> if no issues exist."
191                    .to_string(),
192            example: Some(EXAMPLE_ISSUES_XML.into()),
193        });
194    }
195
196    Ok(IssuesElements {
197        issues: filtered_issues,
198        no_issues_found: filtered_no_issues,
199    })
200}
201
/// Result of successfully validating an issues XML document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IssuesElements {
    /// Text of each reported issue, in document order (may be empty).
    pub issues: Vec<String>,
    /// Message given when the document states that no issues were found.
    pub no_issues_found: Option<String>,
}

impl IssuesElements {
    /// Returns true when the issue list is empty *and* a no-issues message is
    /// present; a value with neither reports `false`.
    #[cfg(any(test, feature = "test-utils"))]
    pub fn is_empty(&self) -> bool {
        match &self.no_issues_found {
            Some(_) => self.issues.is_empty(),
            None => false,
        }
    }

    /// Returns how many issues were reported.
    #[cfg(any(test, feature = "test-utils"))]
    pub fn issue_count(&self) -> usize {
        let Self { issues, .. } = self;
        issues.len()
    }
}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// A single `<ralph-issue>` yields exactly that issue and no no-issues message.
    #[test]
    fn test_validate_valid_single_issue() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue description</ralph-issue>
</ralph-issues>"#;

        let parsed = validate_issues_xml(input).unwrap();
        assert_eq!(parsed.issues, ["First issue description"]);
        assert_eq!(parsed.no_issues_found, None);
    }

    /// Several `<ralph-issue>` elements are all collected.
    #[test]
    fn test_validate_valid_multiple_issues() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue</ralph-issue>
<ralph-issue>Second issue</ralph-issue>
<ralph-issue>Third issue</ralph-issue>
</ralph-issues>"#;

        let parsed = validate_issues_xml(input).unwrap();
        assert_eq!(parsed.issues.len(), 3);
        assert_eq!(parsed.issue_count(), 3);
    }

    /// A lone `<ralph-no-issues-found>` produces an "empty" result.
    #[test]
    fn test_validate_valid_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>
</ralph-issues>"#;

        let parsed = validate_issues_xml(input).unwrap();
        assert_eq!(parsed.issues.len(), 0);
        assert!(parsed.no_issues_found.is_some());
        assert!(parsed.is_empty());
    }

    /// Plain text without any root tag is rejected, pointing at `ralph-issues`.
    #[test]
    fn test_validate_missing_root_element() {
        let err = validate_issues_xml("Some random text without proper XML tags").unwrap_err();
        assert_eq!(err.element_path, "ralph-issues");
    }

    /// A root element without children is rejected.
    #[test]
    fn test_validate_empty_issues() {
        let err = validate_issues_xml("<ralph-issues>\n</ralph-issues>").unwrap_err();
        assert!(err.expected.contains("at least one"));
    }

    /// Issues and the no-issues marker are mutually exclusive.
    #[test]
    fn test_validate_mixed_issues_and_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue</ralph-issue>
<ralph-no-issues-found>No issues</ralph-no-issues-found>
</ralph-issues>"#;

        let err = validate_issues_xml(input).unwrap_err();
        let mentions_not_both = [&err.suggestion, &err.expected]
            .iter()
            .any(|field| field.contains("not both"));
        assert!(mentions_not_both);
    }

    /// The no-issues marker may appear only once.
    #[test]
    fn test_validate_duplicate_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>No issues</ralph-no-issues-found>
<ralph-no-issues-found>Also no issues</ralph-no-issues-found>
</ralph-issues>"#;

        assert!(validate_issues_xml(input).is_err());
    }

    /// Key test: whitespace around and between elements must be tolerated.
    #[test]
    fn test_validate_whitespace_handling() {
        let input =
            "  <ralph-issues>  \n  <ralph-issue>Issue text</ralph-issue>  \n  </ralph-issues>  ";

        assert!(validate_issues_xml(input).is_ok());
    }

    /// A leading XML declaration is skipped.
    #[test]
    fn test_validate_with_xml_declaration() {
        let input = r#"<?xml version="1.0"?>
<ralph-issues>
<ralph-issue>Issue text</ralph-issue>
</ralph-issues>"#;

        assert!(validate_issues_xml(input).is_ok());
    }

    /// `<code>` children are allowed inside an issue; their text is merged with
    /// the surrounding text.
    #[test]
    fn test_validate_issue_with_code_element() {
        let input = r#"<ralph-issues>
<ralph-issue>Check if <code>a &lt; b</code> is valid</ralph-issue>
</ralph-issues>"#;

        let parsed = validate_issues_xml(input).unwrap();
        assert_eq!(parsed.issues.len(), 1);
        // Text from both outside and inside <code> must be present.
        for fragment in &["Check if", "a < b", "is valid"] {
            assert!(parsed.issues[0].contains(*fragment));
        }
    }

    /// `<code>` children are also allowed inside the no-issues message.
    #[test]
    fn test_validate_no_issues_with_code_element() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>All <code>Record&lt;string, T&gt;</code> types are correct</ralph-no-issues-found>
</ralph-issues>"#;

        let parsed = validate_issues_xml(input).unwrap();
        let message = parsed.no_issues_found.unwrap();
        assert!(message.contains("Record<string, T>"));
    }

    // -------------------------------------------------------------------------
    // Realistic LLM output
    // Patterns that LLMs actually produce when following the prompts.
    // -------------------------------------------------------------------------

    /// Generic type escaped as instructed by the prompt.
    #[test]
    fn test_llm_realistic_issue_with_generic_type_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/parser.rs:42 - The function <code>parse&lt;T&gt;</code> does not handle empty input.
Suggested fix: Add a check for empty input before parsing.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse escaped generic: {:?}",
            outcome
        );
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("parse<T>"));
    }

    /// Comparison operators escaped correctly.
    #[test]
    fn test_llm_realistic_issue_with_comparison_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[Medium] src/validate.rs:15 - The condition <code>count &lt; 0</code> should be <code>count &lt;= 0</code>.
Suggested fix: Change the comparison operator.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse escaped comparisons: {:?}",
            outcome
        );
        let parsed = outcome.unwrap();
        for fragment in &["count < 0", "count <= 0"] {
            assert!(parsed.issues[0].contains(*fragment));
        }
    }

    /// `&&` escaped, `||` left as-is.
    #[test]
    fn test_llm_realistic_issue_with_logical_operators_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[Low] src/filter.rs:88 - The expression <code>a &amp;&amp; b || c</code> has ambiguous precedence.
Suggested fix: Add explicit parentheses.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse escaped logical operators: {:?}",
            outcome
        );
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("a && b || c"));
    }

    /// Rust lifetime syntax inside `<code>`.
    #[test]
    fn test_llm_realistic_issue_with_rust_lifetime() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/buffer.rs:23 - The lifetime <code>&amp;'a str</code> should match the struct lifetime.
Suggested fix: Ensure lifetime annotations are consistent.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse lifetime syntax: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("&'a str"));
    }

    /// Escaped HTML markup inside `<code>`.
    #[test]
    fn test_llm_realistic_issue_with_html_in_description() {
        let input = r#"<ralph-issues>
<ralph-issue>[Medium] src/template.rs:56 - The HTML template uses <code>&lt;div class="container"&gt;</code> but should use semantic tags.
Suggested fix: Replace with appropriate semantic HTML elements.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse HTML in code: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("<div class=\"container\">"));
    }

    /// Multi-line explanation in the no-issues message.
    #[test]
    fn test_llm_realistic_no_issues_with_detailed_explanation() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>The implementation correctly handles all edge cases:
- Input validation properly rejects values where <code>x &lt; 0</code>
- The generic <code>Result&lt;T, E&gt;</code> type is used consistently
- Error handling follows the project's established patterns
No issues require attention.</ralph-no-issues-found>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse detailed no-issues: {:?}",
            outcome
        );
        let message = outcome.unwrap().no_issues_found.unwrap();
        for fragment in &["x < 0", "Result<T, E>"] {
            assert!(message.contains(*fragment));
        }
    }

    /// Several issues, each with different escaped content.
    #[test]
    fn test_llm_realistic_multiple_issues_with_mixed_content() {
        let input = r#"<ralph-issues>
<ralph-issue>[Critical] src/auth.rs:12 - SQL injection vulnerability: user input in <code>query &amp;&amp; filter</code> is not sanitized.</ralph-issue>
<ralph-issue>[High] src/api.rs:45 - Missing null check: <code>response.data</code> may be undefined when <code>status &lt; 200</code>.</ralph-issue>
<ralph-issue>[Medium] src/utils.rs:78 - The type <code>Option&lt;Vec&lt;T&gt;&gt;</code> could be simplified to <code>Vec&lt;T&gt;</code> with empty default.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse multiple issues with mixed content: {:?}",
            outcome
        );
        let parsed = outcome.unwrap();
        let expected_fragments = ["query && filter", "status < 200", "Option<Vec<T>>"];
        assert_eq!(parsed.issues.len(), 3);
        for (issue, fragment) in parsed.issues.iter().zip(expected_fragments.iter()) {
            assert!(issue.contains(*fragment));
        }
    }

    /// A forgotten escape for `<` must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_less_than_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/compare.rs:10 - The condition a < b is wrong.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_err(),
            "Unescaped < should fail XML parsing: {:?}",
            outcome
        );
    }

    /// A forgotten escape for a generic type must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_generic_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/types.rs:5 - The type Vec<String> is incorrect.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_err(),
            "Unescaped generic should fail XML parsing: {:?}",
            outcome
        );
    }

    /// A forgotten escape for `&` must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_ampersand_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/logic.rs:20 - The expression a && b is wrong.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_err(),
            "Unescaped && should fail XML parsing: {:?}",
            outcome
        );
    }

    /// CDATA is an accepted alternative to entity escaping.
    #[test]
    fn test_llm_uses_cdata_for_code_content() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/cmp.rs:10 - The condition <code><![CDATA[a < b && c > d]]></code> has issues.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "CDATA should be valid: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("a < b && c > d"));
    }
}