Skip to main content

ralph_workflow/files/llm_output_extraction/
commit.rs

1//! Commit Message Extraction Functions
2//!
3//! This module provides utilities for extracting commit messages from AI agent output
4//! using XML format with XSD validation.
5
6use super::cleaning::{final_escape_sequence_cleanup, unescape_json_strings_aggressive};
7use super::xml_extraction::extract_xml_commit;
8use super::xsd_validation::validate_xml_against_xsd;
9
10/// Result of commit message extraction.
11///
12/// This struct wraps a successfully extracted commit message.
13#[derive(Debug, Clone, PartialEq, Eq)]
14pub struct CommitExtractionResult(String);
15
16impl CommitExtractionResult {
17    /// Create a new extraction result with the given message.
18    pub fn new(message: String) -> Self {
19        Self(message)
20    }
21
22    /// Convert into the inner message string with final escape sequence cleanup.
23    ///
24    /// This applies the final rendering step to ensure no escape sequences leak through
25    /// to the actual commit message.
26    pub fn into_message(self) -> String {
27        render_final_commit_message(&self.0)
28    }
29}
30
31/// Try to extract commit message from XML format with detailed tracing.
32///
33/// This function uses flexible XML extraction to handle various AI embedding patterns:
34/// - Direct XML tags at content start
35/// - XML in markdown code fences (```xml, ```)
36/// - XML in JSON strings (escaped)
37/// - XML embedded within analysis text
38///
39/// The XML format is preferred because:
40/// - No escape sequence issues (actual newlines work fine)
41/// - Distinctive tags unlikely to appear in LLM analysis text
42/// - Clear boundaries for parsing
43///
44/// # Expected Format
45///
46/// ```xml
47/// <ralph-commit>
48/// <ralph-subject>type(scope): description</ralph-subject>
49/// <ralph-body>Optional body text here.
50/// Can span multiple lines.</ralph-body>
51/// </ralph-commit>
52/// ```
53///
54/// Or with detailed body elements:
55///
56/// ```xml
57/// <ralph-commit>
58/// <ralph-subject>type(scope): description</ralph-subject>
59/// <ralph-body-summary>Brief summary</ralph-body-summary>
60/// <ralph-body-details>Detailed bullet points</ralph-body-details>
61/// <ralph-body-footer>BREAKING CHANGE or Fixes #123</ralph-body-footer>
62/// </ralph-commit>
63/// ```
64///
65/// The `<ralph-body>` tag is optional and may be omitted for commits without a body.
66///
67/// # Returns
68///
69/// A tuple of `(Option<String>, String)`:
70/// - First element: `Some(message)` if valid XML with a valid conventional commit subject was found, `None` otherwise
71/// - Second element: Detailed reason string explaining what was found/not found (for debugging)
72pub fn try_extract_xml_commit_with_trace(content: &str) -> (Option<String>, String) {
73    // Try flexible XML extraction that handles various AI embedding patterns.
74    // If extraction fails, use the raw content directly - XSD validation will
75    // provide a clear error message explaining what's wrong (e.g., missing
76    // <ralph-commit> root element) that can be sent back to the AI for retry.
77    let (xml_block, extraction_pattern) = match extract_xml_commit(content) {
78        Some(xml) => {
79            // Detect which extraction pattern was used for logging
80            let pattern = if content.trim().starts_with("<ralph-commit>") {
81                "direct XML"
82            } else if content.contains("```xml") || content.contains("```\n<ralph-commit>") {
83                "markdown code fence"
84            } else if content.contains("{\"result\":") || content.contains("\"result\":") {
85                "JSON string"
86            } else {
87                "embedded search"
88            };
89            (xml, pattern)
90        }
91        None => {
92            // No XML tags found - use raw content and let XSD validation
93            // produce an informative error for the AI to retry
94            (content.to_string(), "raw content (no XML tags found)")
95        }
96    };
97
98    // Run XSD validation - this will catch both malformed XML and missing elements
99    let xsd_result = validate_xml_against_xsd(&xml_block);
100
101    let message = match xsd_result {
102        Ok(elements) => {
103            // Format the commit message using parsed elements
104            let body = elements.format_body();
105            if body.is_empty() {
106                elements.subject.clone()
107            } else {
108                format!("{}\n\n{}", elements.subject, body)
109            }
110        }
111        Err(e) => {
112            // XSD validation failed - return error with details for AI retry
113            let error_msg = e.format_for_ai_retry();
114            return (None, format!("XSD validation failed: {}", error_msg));
115        }
116    };
117
118    // Determine body presence for logging
119    let has_body = message.lines().count() > 1;
120
121    (
122        Some(message.clone()),
123        format!(
124            "Found <ralph-commit> via {}, XSD validation passed, body={}, message: '{}'",
125            extraction_pattern,
126            if has_body { "present" } else { "absent" },
127            if message.len() > 80 {
128                format!("{}...", &message[..80].replace('\n', "\\n"))
129            } else {
130                message.replace('\n', "\\n")
131            }
132        ),
133    )
134}
135
/// Check if a string is a valid conventional commit subject line.
///
/// The subject must start with a prefix of the form `type[(scope)][!]:` where
/// `type` is one of `feat`, `fix`, `docs`, `style`, `refactor`, `perf`, `test`,
/// `build`, `ci` or `chore` (case-sensitive). The optional scope must be closed
/// by a `)`, the optional `!` (breaking-change marker) may appear once, and the
/// colon must follow immediately.
///
/// Only the structure of the prefix is checked: the scope's content and the
/// description after the colon are not validated.
///
/// # Returns
///
/// `true` when the prefix is well formed, `false` otherwise (including for
/// subjects without a colon, with an unclosed scope such as `feat(scope: msg`,
/// or with stray text between the scope and the colon).
pub fn is_conventional_commit_subject(subject: &str) -> bool {
    const VALID_TYPES: [&str; 10] = [
        "feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore",
    ];

    // The type runs up to the first scope opener, breaking marker, or colon.
    // If none of those occurs there is no colon at all, so the subject is invalid.
    let Some(type_end) = subject.find(|c: char| matches!(c, '(' | '!' | ':')) else {
        return false;
    };

    let (commit_type, rest) = subject.split_at(type_end);
    if !VALID_TYPES.contains(&commit_type) {
        return false;
    }

    // Optional "(scope)": when a scope is opened it must also be closed.
    let rest = match rest.strip_prefix('(') {
        Some(after_open) => match after_open.find(')') {
            Some(close) => &after_open[close + 1..],
            None => return false,
        },
        None => rest,
    };

    // Optional single "!" marking a breaking change.
    let rest = rest.strip_prefix('!').unwrap_or(rest);

    // Nothing else may sit between the prefix and the colon.
    rest.starts_with(':')
}
157
158// =========================================================================
159// Final Commit Message Rendering
160// =========================================================================
161
162/// Render the final commit message with all cleanup applied.
163///
164/// This is the final step before returning a commit message for use in git commit.
165/// It applies:
166/// 1. Escape sequence cleanup (aggressive unescaping)
167/// 2. Final whitespace cleanup
168///
169/// # Arguments
170///
171/// * `message` - The commit message to render
172///
173/// # Returns
174///
175/// The fully rendered commit message with all escape sequences properly handled.
176pub fn render_final_commit_message(message: &str) -> String {
177    let mut result = message.to_string();
178
179    // Step 1: Apply final escape sequence cleanup
180    // This handles any escape sequences that leaked through the pipeline
181    result = final_escape_sequence_cleanup(&result);
182
183    // Step 2: Try aggressive unescaping if there are still escape sequences
184    if result.contains("\\n") || result.contains("\\t") || result.contains("\\r") {
185        result = unescape_json_strings_aggressive(&result);
186    }
187
188    // Step 3: Final whitespace cleanup
189    result = result
190        .lines()
191        .map(str::trim)
192        .filter(|l| !l.is_empty())
193        .collect::<Vec<_>>()
194        .join("\n");
195
196    result
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the extractor and keep only the extracted message, dropping the trace.
    fn extract(content: &str) -> Option<String> {
        try_extract_xml_commit_with_trace(content).0
    }

    // -------------------------------------------------------------------------
    // CommitExtractionResult
    // -------------------------------------------------------------------------

    #[test]
    fn test_commit_extraction_result_into_message() {
        let wrapped = CommitExtractionResult::new(String::from("feat: add feature"));
        assert_eq!(wrapped.into_message(), "feat: add feature");
    }

    // -------------------------------------------------------------------------
    // render_final_commit_message
    // -------------------------------------------------------------------------

    #[test]
    fn test_render_final_commit_message_with_literal_escapes() {
        // Literal escape sequences are cleaned up; blank lines are then removed.
        let rendered =
            render_final_commit_message("feat: add feature\n\\n\\nBody with literal escapes");
        assert_eq!(rendered, "feat: add feature\nBody with literal escapes");
    }

    #[test]
    fn test_render_final_commit_message_already_clean() {
        // A clean message only goes through whitespace cleanup.
        let rendered = render_final_commit_message("feat: add feature\n\nBody text here");
        assert_eq!(rendered, "feat: add feature\nBody text here");
    }

    #[test]
    fn test_render_final_commit_message_with_tabs() {
        // Escaped tabs become real tabs, which per-line trimming then strips.
        let rendered =
            render_final_commit_message("feat: add feature\\n\\t- item 1\\n\\t- item 2");
        assert_eq!(rendered, "feat: add feature\n- item 1\n- item 2");
    }

    #[test]
    fn test_render_final_commit_message_with_carriage_returns() {
        // Escaped CRLF is converted; no extra blank line may survive.
        let rendered = render_final_commit_message("feat: add feature\\r\\nBody text");
        assert_eq!(rendered, "feat: add feature\nBody text");
    }

    #[test]
    fn test_render_final_commit_message_whitespace_cleanup() {
        // Trailing empty and whitespace-only lines are dropped.
        let rendered = render_final_commit_message("feat: add feature\n\nBody text\n\n\n  \n  ");
        assert_eq!(rendered, "feat: add feature\nBody text");
    }

    #[test]
    fn test_render_final_commit_message_mixed_escape_sequences() {
        // Mixed \n, \r\n and \t escapes: newlines normalized, tabs trimmed,
        // blank lines removed.
        let rendered = render_final_commit_message(
            "feat: add feature\\n\\nDetails:\\r\\n\\t- item 1\\n\\t- item 2",
        );
        assert_eq!(rendered, "feat: add feature\nDetails:\n- item 1\n- item 2");
    }

    // -------------------------------------------------------------------------
    // is_conventional_commit_subject
    // -------------------------------------------------------------------------

    #[test]
    fn test_conventional_commit_subject_valid() {
        let accepted = [
            "feat: add feature",
            "fix: resolve bug",
            "docs: update readme",
            "refactor(core): simplify logic",
            "feat!: breaking change",
            "fix(api)!: breaking fix",
        ];
        for subject in accepted {
            assert!(is_conventional_commit_subject(subject));
        }
    }

    #[test]
    fn test_conventional_commit_subject_invalid() {
        let rejected = [
            "invalid: not a type",
            "no colon here",
            "",
            "Feature: capitalize",
        ];
        for subject in rejected {
            assert!(!is_conventional_commit_subject(subject));
        }
    }

    // -------------------------------------------------------------------------
    // XML extraction (try_extract_xml_commit_with_trace)
    // -------------------------------------------------------------------------

    #[test]
    fn test_xml_extract_basic_subject_only() {
        // Subject only, no body element.
        let content = r"<ralph-commit>
<ralph-subject>feat: add new feature</ralph-subject>
</ralph-commit>";
        let (extracted, reason) = try_extract_xml_commit_with_trace(content);
        let message = extracted
            .unwrap_or_else(|| panic!("Should extract from basic XML. Reason: {}", reason));
        assert_eq!(message, "feat: add new feature");
    }

    #[test]
    fn test_xml_extract_with_body() {
        // Subject plus a multi-line body.
        let content = r"<ralph-commit>
<ralph-subject>feat(auth): add OAuth2 login flow</ralph-subject>
<ralph-body>Implement Google and GitHub OAuth providers.
Add session management for OAuth tokens.</ralph-body>
</ralph-commit>";
        let msg = extract(content).expect("Should extract from XML with body");
        assert!(msg.starts_with("feat(auth): add OAuth2 login flow"));
        assert!(msg.contains("Implement Google and GitHub OAuth providers"));
        assert!(msg.contains("Add session management"));
    }

    #[test]
    fn test_xml_extract_with_empty_body() {
        // An empty body element must be treated like a missing body.
        let content = r"<ralph-commit>
<ralph-subject>fix: resolve bug</ralph-subject>
<ralph-body></ralph-body>
</ralph-commit>";
        let msg = extract(content).expect("Should extract even with empty body");
        assert_eq!(msg, "fix: resolve bug");
    }

    #[test]
    fn test_xml_extract_ignores_preamble() {
        // Text before and after <ralph-commit> is ignored.
        let content = r"Here is the commit message based on my analysis:

Looking at the diff, I can see...

<ralph-commit>
<ralph-subject>refactor: simplify logic</ralph-subject>
</ralph-commit>

That's all!";
        let msg = extract(content).expect("Should ignore preamble and extract XML");
        assert_eq!(msg, "refactor: simplify logic");
    }

    #[test]
    fn test_xml_extract_fails_missing_tags() {
        // Plain text without any XML tags yields no message.
        assert!(
            extract("Just some text without XML tags").is_none(),
            "Should fail when XML tags are missing"
        );
    }

    #[test]
    fn test_xml_extract_fails_invalid_commit_type() {
        // A subject with an unknown commit type is rejected.
        let content = r"<ralph-commit>
<ralph-subject>invalid: not a real type</ralph-subject>
</ralph-commit>";
        assert!(
            extract(content).is_none(),
            "Should reject invalid commit type"
        );
    }

    #[test]
    fn test_xml_extract_fails_missing_subject() {
        // A commit without a subject element is rejected.
        let content = r"<ralph-commit>
<ralph-body>Just a body, no subject</ralph-body>
</ralph-commit>";
        assert!(
            extract(content).is_none(),
            "Should fail when subject is missing"
        );
    }

    #[test]
    fn test_xml_extract_fails_empty_subject() {
        // A subject element with no text is rejected.
        let content = r"<ralph-commit>
<ralph-subject></ralph-subject>
</ralph-commit>";
        assert!(
            extract(content).is_none(),
            "Should fail when subject is empty"
        );
    }

    #[test]
    fn test_xml_extract_handles_whitespace_in_subject() {
        // Whitespace around the subject text is trimmed.
        let content = r"<ralph-commit>
<ralph-subject>   docs: update readme   </ralph-subject>
</ralph-commit>";
        let msg = extract(content).expect("Should handle whitespace in subject");
        assert_eq!(msg, "docs: update readme");
    }

    #[test]
    fn test_xml_extract_with_breaking_change() {
        // The `!` breaking-change marker is accepted in the subject.
        let content = r"<ralph-commit>
<ralph-subject>feat!: drop Python 3.7 support</ralph-subject>
<ralph-body>BREAKING CHANGE: Minimum Python version is now 3.8.</ralph-body>
</ralph-commit>";
        let msg = extract(content).expect("Should handle breaking change indicator");
        assert!(msg.starts_with("feat!:"));
        assert!(msg.contains("BREAKING CHANGE"));
    }

    #[test]
    fn test_xml_extract_with_scope() {
        // A scoped subject is returned unchanged.
        let content = r"<ralph-commit>
<ralph-subject>test(parser): add coverage for edge cases</ralph-subject>
</ralph-commit>";
        let msg = extract(content).expect("Should handle scope in subject");
        assert_eq!(msg, "test(parser): add coverage for edge cases");
    }

    #[test]
    fn test_xml_extract_body_preserves_newlines() {
        // Line breaks inside the body survive extraction.
        let content = r"<ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
<ralph-body>Line 1
Line 2
Line 3</ralph-body>
</ralph-commit>";
        let msg = extract(content).expect("Should preserve newlines in body");
        assert!(msg.contains("Line 1\nLine 2\nLine 3"));
    }

    #[test]
    fn test_xml_extract_fails_malformed_tags() {
        // Closing tag before the opening tag is malformed and must be rejected.
        let content = r"</ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
<ralph-commit>";
        assert!(extract(content).is_none(), "Should fail for malformed tags");
    }

    #[test]
    fn test_xml_extract_handles_markdown_code_fence() {
        // XML wrapped in a markdown code fence is still found, because the
        // tags themselves are present in the content.
        let content = r"```xml
<ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
</ralph-commit>
```";
        assert!(
            extract(content).is_some(),
            "Should extract from XML even inside code fence"
        );
    }

    #[test]
    fn test_xml_extract_with_thinking_preamble() {
        // A "thinking" preamble in front of the XML is ignored.
        let log_content = r"[Claude] Thinking: Looking at this diff, I need to analyze...

<ralph-commit>
<ralph-subject>feat(pipeline): add recovery mechanism</ralph-subject>
<ralph-body>When commit validation fails, attempt to salvage valid message.</ralph-body>
</ralph-commit>";

        assert!(matches!(
            extract(log_content),
            Some(msg) if msg.starts_with("feat(pipeline):")
        ));
    }

    // XSD validation runs inside try_extract_xml_commit_with_trace; this test
    // makes sure that path is exercised and reported in the trace.
    #[test]
    fn test_xsd_validation_integrated_in_extraction() {
        let xml = r"Some text before
<ralph-commit>
<ralph-subject>fix: resolve bug</ralph-subject>
</ralph-commit>
Some text after";
        let (msg, trace) = try_extract_xml_commit_with_trace(xml);
        assert!(msg.is_some(), "Should extract valid message");
        assert!(trace.contains("XSD"), "Trace should mention XSD validation");
    }
}