Skip to main content

ralph_workflow/files/llm_output_extraction/
xsd_validation_issues.rs

//! XSD validation for the issues XML format.
//!
//! This module validates XML output against the XSD schema to ensure that
//! AI agent output conforms to the expected format for review issues.
//!
//! Uses `quick_xml` for robust XML parsing with proper whitespace handling.
7
8use crate::files::llm_output_extraction::xml_helpers::{
9    create_reader, duplicate_element_error, format_content_preview, malformed_xml_error,
10    read_text_until_end, skip_to_end, text_outside_tags_error, unexpected_element_error,
11};
12use crate::files::llm_output_extraction::xsd_validation::{XsdErrorType, XsdValidationError};
13use quick_xml::events::Event;
14
/// Sample of a well-formed issues document that reports two issues.
///
/// Attached to validation errors as the `example` payload.
const EXAMPLE_ISSUES_XML: &str = concat!(
    "<ralph-issues>\n",
    "<ralph-issue>Missing error handling in API endpoint</ralph-issue>\n",
    "<ralph-issue>Variable shadowing in loop construct</ralph-issue>\n",
    "</ralph-issues>"
);
20
/// Sample of a well-formed issues document that reports no issues.
///
/// Attached to validation errors as the `example` payload.
const EXAMPLE_NO_ISSUES_XML: &str = concat!(
    "<ralph-issues>\n",
    "<ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>\n",
    "</ralph-issues>"
);
25
26/// Validate issues XML content against the XSD schema.
27///
28/// This function validates that the XML content conforms to the expected
29/// issues format defined in issues.xsd:
30///
31/// ```xml
32/// <ralph-issues>
33///   <ralph-issue>First issue description</ralph-issue>
34///   <ralph-issue>Second issue description</ralph-issue>
35///   ...
36/// </ralph-issues>
37/// ```
38///
39/// OR for no issues:
40///
41/// ```xml
42/// <ralph-issues>
43///   <ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>
44/// </ralph-issues>
45/// ```
46///
47/// # Arguments
48///
49/// * `xml_content` - The XML content to validate
50///
51/// # Returns
52///
53/// * `Ok(IssuesElements)` if the XML is valid and contains all required elements
54/// * `Err(XsdValidationError)` if the XML is invalid or doesn't conform to the schema
55pub fn validate_issues_xml(xml_content: &str) -> Result<IssuesElements, XsdValidationError> {
56    let content = xml_content.trim();
57    let mut reader = create_reader(content);
58    let mut buf = Vec::new();
59
60    // Find the root element
61    loop {
62        match reader.read_event_into(&mut buf) {
63            Ok(Event::Start(e)) if e.name().as_ref() == b"ralph-issues" => break,
64            Ok(Event::Start(e)) => {
65                let name_bytes = e.name();
66                let tag_name = String::from_utf8_lossy(name_bytes.as_ref());
67                return Err(XsdValidationError {
68                    error_type: XsdErrorType::MissingRequiredElement,
69                    element_path: "ralph-issues".to_string(),
70                    expected: "<ralph-issues> as root element".to_string(),
71                    found: format!("<{}> (wrong root element)", tag_name),
72                    suggestion: "Use <ralph-issues> as the root element.".to_string(),
73                    example: Some(EXAMPLE_ISSUES_XML.into()),
74                });
75            }
76            Ok(Event::Text(_)) => {
77                // Text before root element - continue to EOF error which is more informative
78            }
79            Ok(Event::Eof) => {
80                return Err(XsdValidationError {
81                    error_type: XsdErrorType::MissingRequiredElement,
82                    element_path: "ralph-issues".to_string(),
83                    expected: "<ralph-issues> as root element".to_string(),
84                    found: format_content_preview(content),
85                    suggestion: "Wrap your issues in <ralph-issues>...</ralph-issues> tags."
86                        .to_string(),
87                    example: Some(EXAMPLE_ISSUES_XML.into()),
88                });
89            }
90            Ok(_) => {} // Skip XML declaration, comments, etc.
91            Err(e) => return Err(malformed_xml_error(e)),
92        }
93        buf.clear();
94    }
95
96    // Parse child elements
97    let mut issues: Vec<String> = Vec::new();
98    let mut no_issues_found: Option<String> = None;
99
100    const VALID_TAGS: [&str; 2] = ["ralph-issue", "ralph-no-issues-found"];
101
102    loop {
103        buf.clear();
104        match reader.read_event_into(&mut buf) {
105            Ok(Event::Start(e)) => {
106                match e.name().as_ref() {
107                    b"ralph-issue" => {
108                        // Cannot mix issues and no-issues-found
109                        if no_issues_found.is_some() {
110                            return Err(XsdValidationError {
111                                error_type: XsdErrorType::UnexpectedElement,
112                                element_path: "ralph-issues/ralph-issue".to_string(),
113                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
114                                found: "mixed issues and no-issues-found".to_string(),
115                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
116                                example: Some(EXAMPLE_ISSUES_XML.into()),
117                            });
118                        }
119                        let issue_text = read_text_until_end(&mut reader, b"ralph-issue")?;
120                        issues.push(issue_text);
121                    }
122                    b"ralph-no-issues-found" => {
123                        // Cannot mix issues and no-issues-found
124                        if !issues.is_empty() {
125                            return Err(XsdValidationError {
126                                error_type: XsdErrorType::UnexpectedElement,
127                                element_path: "ralph-issues/ralph-no-issues-found".to_string(),
128                                expected: "either <ralph-issue> elements OR <ralph-no-issues-found>, not both".to_string(),
129                                found: "mixed issues and no-issues-found".to_string(),
130                                suggestion: "Use <ralph-issue> when issues exist, or <ralph-no-issues-found> when no issues exist.".to_string(),
131                                example: Some(EXAMPLE_NO_ISSUES_XML.into()),
132                            });
133                        }
134                        if no_issues_found.is_some() {
135                            return Err(duplicate_element_error(
136                                "ralph-no-issues-found",
137                                "ralph-issues",
138                            ));
139                        }
140                        no_issues_found =
141                            Some(read_text_until_end(&mut reader, b"ralph-no-issues-found")?);
142                    }
143                    other => {
144                        let _ = skip_to_end(&mut reader, other);
145                        return Err(unexpected_element_error(other, &VALID_TAGS, "ralph-issues"));
146                    }
147                }
148            }
149            Ok(Event::Text(e)) => {
150                let text = e.unescape().unwrap_or_default();
151                let trimmed = text.trim();
152                if !trimmed.is_empty() {
153                    return Err(text_outside_tags_error(trimmed, "ralph-issues"));
154                }
155            }
156            Ok(Event::End(e)) if e.name().as_ref() == b"ralph-issues" => break,
157            Ok(Event::Eof) => {
158                return Err(XsdValidationError {
159                    error_type: XsdErrorType::MalformedXml,
160                    element_path: "ralph-issues".to_string(),
161                    expected: "closing </ralph-issues> tag".to_string(),
162                    found: "end of content without closing tag".to_string(),
163                    suggestion: "Add </ralph-issues> at the end.".to_string(),
164                    example: Some(EXAMPLE_ISSUES_XML.into()),
165                });
166            }
167            Ok(_) => {} // Skip comments, etc.
168            Err(e) => return Err(malformed_xml_error(e)),
169        }
170    }
171
172    // Filter out empty issues
173    let filtered_issues: Vec<String> = issues.into_iter().filter(|s| !s.is_empty()).collect();
174    let filtered_no_issues = no_issues_found.filter(|s| !s.is_empty());
175
176    // Must have either issues or no-issues-found
177    if filtered_issues.is_empty() && filtered_no_issues.is_none() {
178        return Err(XsdValidationError {
179            error_type: XsdErrorType::MissingRequiredElement,
180            element_path: "ralph-issues".to_string(),
181            expected: "at least one <ralph-issue> element OR <ralph-no-issues-found>".to_string(),
182            found: "empty <ralph-issues> element".to_string(),
183            suggestion:
184                "Add <ralph-issue> elements for issues found, or <ralph-no-issues-found> if no issues exist."
185                    .to_string(),
186            example: Some(EXAMPLE_ISSUES_XML.into()),
187        });
188    }
189
190    Ok(IssuesElements {
191        issues: filtered_issues,
192        no_issues_found: filtered_no_issues,
193    })
194}
195
/// Result of a successful issues-XML validation.
///
/// Holds the text of every reported issue and, separately, the optional
/// "no issues found" message.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IssuesElements {
    /// Text of each reported issue, in document order.
    pub issues: Vec<String>,
    /// Message from `<ralph-no-issues-found>`, when present.
    pub no_issues_found: Option<String>,
}

impl IssuesElements {
    /// Returns `true` when the issue list is empty and a "no issues found"
    /// message is present.
    #[cfg(any(test, feature = "test-utils"))]
    pub fn is_empty(&self) -> bool {
        matches!(
            (self.issues.as_slice(), &self.no_issues_found),
            ([], Some(_))
        )
    }

    /// Returns how many issues were reported.
    #[cfg(any(test, feature = "test-utils"))]
    pub fn issue_count(&self) -> usize {
        let Self { issues, .. } = self;
        issues.len()
    }
}
218
#[cfg(test)]
mod tests {
    use super::*;

    // -------------------------------------------------------------------------
    // Structural validation
    // -------------------------------------------------------------------------

    /// A single `<ralph-issue>` is returned verbatim.
    #[test]
    fn test_validate_valid_single_issue() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue description</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert_eq!(parsed.issues, vec!["First issue description".to_string()]);
        assert_eq!(parsed.no_issues_found, None);
    }

    /// Several `<ralph-issue>` elements are all collected.
    #[test]
    fn test_validate_valid_multiple_issues() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue</ralph-issue>
<ralph-issue>Second issue</ralph-issue>
<ralph-issue>Third issue</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert_eq!(parsed.issues.len(), 3);
        assert_eq!(parsed.issue_count(), 3);
    }

    /// The no-issues form yields an empty issue list plus the message.
    #[test]
    fn test_validate_valid_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>No issues were found during review</ralph-no-issues-found>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert!(parsed.issues.is_empty());
        assert!(parsed.no_issues_found.is_some());
        assert!(parsed.is_empty());
    }

    /// Plain text without any tags is reported against the root element path.
    #[test]
    fn test_validate_missing_root_element() {
        let input = r#"Some random text without proper XML tags"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err());
        let err = outcome.unwrap_err();
        assert_eq!(err.element_path, "ralph-issues");
    }

    /// A root element without children is rejected.
    #[test]
    fn test_validate_empty_issues() {
        let input = r#"<ralph-issues>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err());
        let err = outcome.unwrap_err();
        assert!(err.expected.contains("at least one"));
    }

    /// Issues and the no-issues marker must not appear together.
    #[test]
    fn test_validate_mixed_issues_and_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-issue>First issue</ralph-issue>
<ralph-no-issues-found>No issues</ralph-no-issues-found>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err());
        let err = outcome.unwrap_err();
        assert!(err.suggestion.contains("not both") || err.expected.contains("not both"));
    }

    /// The no-issues marker may appear only once.
    #[test]
    fn test_validate_duplicate_no_issues_found() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>No issues</ralph-no-issues-found>
<ralph-no-issues-found>Also no issues</ralph-no-issues-found>
</ralph-issues>"#;

        assert!(validate_issues_xml(input).is_err());
    }

    /// Whitespace around and between elements must not break validation.
    #[test]
    fn test_validate_whitespace_handling() {
        let input =
            "  <ralph-issues>  \n  <ralph-issue>Issue text</ralph-issue>  \n  </ralph-issues>  ";

        assert!(validate_issues_xml(input).is_ok());
    }

    /// A leading XML declaration is skipped.
    #[test]
    fn test_validate_with_xml_declaration() {
        let input = r#"<?xml version="1.0"?>
<ralph-issues>
<ralph-issue>Issue text</ralph-issue>
</ralph-issues>"#;

        assert!(validate_issues_xml(input).is_ok());
    }

    /// `<code>` children inside an issue contribute their text to the issue.
    #[test]
    fn test_validate_issue_with_code_element() {
        let input = r#"<ralph-issues>
<ralph-issue>Check if <code>a &lt; b</code> is valid</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert_eq!(parsed.issues.len(), 1);
        // Text both outside and inside <code> must be present.
        let issue = &parsed.issues[0];
        assert!(issue.contains("Check if"));
        assert!(issue.contains("a < b"));
        assert!(issue.contains("is valid"));
    }

    /// `<code>` children are also accepted inside the no-issues message.
    #[test]
    fn test_validate_no_issues_with_code_element() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>All <code>Record&lt;string, T&gt;</code> types are correct</ralph-no-issues-found>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert!(parsed.no_issues_found.is_some());
        let message = parsed.no_issues_found.unwrap();
        assert!(message.contains("Record<string, T>"));
    }

    // -------------------------------------------------------------------------
    // Realistic LLM output
    // Patterns that LLMs actually produce when following the prompts.
    // -------------------------------------------------------------------------

    /// Generic types escaped as entities are unescaped in the result.
    #[test]
    fn test_llm_realistic_issue_with_generic_type_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/parser.rs:42 - The function <code>parse&lt;T&gt;</code> does not handle empty input.
Suggested fix: Add a check for empty input before parsing.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse escaped generic: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("parse<T>"));
    }

    /// Escaped comparison operators are unescaped in the result.
    #[test]
    fn test_llm_realistic_issue_with_comparison_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[Medium] src/validate.rs:15 - The condition <code>count &lt; 0</code> should be <code>count &lt;= 0</code>.
Suggested fix: Change the comparison operator.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse escaped comparisons: {:?}", outcome);
        let parsed = outcome.unwrap();
        let issue = &parsed.issues[0];
        assert!(issue.contains("count < 0"));
        assert!(issue.contains("count <= 0"));
    }

    /// Escaped `&&` alongside a literal `||` is unescaped in the result.
    #[test]
    fn test_llm_realistic_issue_with_logical_operators_escaped() {
        let input = r#"<ralph-issues>
<ralph-issue>[Low] src/filter.rs:88 - The expression <code>a &amp;&amp; b || c</code> has ambiguous precedence.
Suggested fix: Add explicit parentheses.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse escaped logical operators: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("a && b || c"));
    }

    /// Rust lifetime syntax with an escaped ampersand survives parsing.
    #[test]
    fn test_llm_realistic_issue_with_rust_lifetime() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/buffer.rs:23 - The lifetime <code>&amp;'a str</code> should match the struct lifetime.
Suggested fix: Ensure lifetime annotations are consistent.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse lifetime syntax: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("&'a str"));
    }

    /// Escaped HTML markup inside `<code>` is returned as literal HTML text.
    #[test]
    fn test_llm_realistic_issue_with_html_in_description() {
        let input = r#"<ralph-issues>
<ralph-issue>[Medium] src/template.rs:56 - The HTML template uses <code>&lt;div class="container"&gt;</code> but should use semantic tags.
Suggested fix: Replace with appropriate semantic HTML elements.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse HTML in code: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("<div class=\"container\">"));
    }

    /// A multi-line no-issues explanation with escaped code is accepted.
    #[test]
    fn test_llm_realistic_no_issues_with_detailed_explanation() {
        let input = r#"<ralph-issues>
<ralph-no-issues-found>The implementation correctly handles all edge cases:
- Input validation properly rejects values where <code>x &lt; 0</code>
- The generic <code>Result&lt;T, E&gt;</code> type is used consistently
- Error handling follows the project's established patterns
No issues require attention.</ralph-no-issues-found>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "Should parse detailed no-issues: {:?}", outcome);
        let parsed = outcome.unwrap();
        let message = parsed.no_issues_found.unwrap();
        assert!(message.contains("x < 0"));
        assert!(message.contains("Result<T, E>"));
    }

    /// Several issues, each with different escaped content, are all parsed.
    #[test]
    fn test_llm_realistic_multiple_issues_with_mixed_content() {
        let input = r#"<ralph-issues>
<ralph-issue>[Critical] src/auth.rs:12 - SQL injection vulnerability: user input in <code>query &amp;&amp; filter</code> is not sanitized.</ralph-issue>
<ralph-issue>[High] src/api.rs:45 - Missing null check: <code>response.data</code> may be undefined when <code>status &lt; 200</code>.</ralph-issue>
<ralph-issue>[Medium] src/utils.rs:78 - The type <code>Option&lt;Vec&lt;T&gt;&gt;</code> could be simplified to <code>Vec&lt;T&gt;</code> with empty default.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(
            outcome.is_ok(),
            "Should parse multiple issues with mixed content: {:?}",
            outcome
        );
        let parsed = outcome.unwrap();
        assert_eq!(parsed.issues.len(), 3);
        assert!(parsed.issues[0].contains("query && filter"));
        assert!(parsed.issues[1].contains("status < 200"));
        assert!(parsed.issues[2].contains("Option<Vec<T>>"));
    }

    /// A raw `<` in issue text must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_less_than_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/compare.rs:10 - The condition a < b is wrong.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err(), "Unescaped < should fail XML parsing: {:?}", outcome);
    }

    /// A raw generic type in issue text must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_generic_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/types.rs:5 - The type Vec<String> is incorrect.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err(), "Unescaped generic should fail XML parsing: {:?}", outcome);
    }

    /// A raw `&&` in issue text must be rejected.
    #[test]
    fn test_llm_mistake_unescaped_ampersand_fails() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/logic.rs:20 - The expression a && b is wrong.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_err(), "Unescaped && should fail XML parsing: {:?}", outcome);
    }

    /// CDATA is accepted as an alternative to entity escaping.
    #[test]
    fn test_llm_uses_cdata_for_code_content() {
        let input = r#"<ralph-issues>
<ralph-issue>[High] src/cmp.rs:10 - The condition <code><![CDATA[a < b && c > d]]></code> has issues.</ralph-issue>
</ralph-issues>"#;

        let outcome = validate_issues_xml(input);
        assert!(outcome.is_ok(), "CDATA should be valid: {:?}", outcome);
        let parsed = outcome.unwrap();
        assert!(parsed.issues[0].contains("a < b && c > d"));
    }
}