Skip to main content

ralph_workflow/files/llm_output_extraction/
commit.rs

//! Commit Message Extraction Functions
//!
//! This module provides utilities for extracting commit messages from AI agent output
//! using XML format with XSD validation.
5
6use super::cleaning::{final_escape_sequence_cleanup, unescape_json_strings_aggressive};
7use super::xml_extraction::extract_xml_commit;
8use super::xsd_validation::validate_xml_against_xsd;
9use crate::common::truncate_text;
10
11/// Result of commit message extraction.
12///
13/// This struct wraps a successfully extracted commit message.
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct CommitExtractionResult(String);
16
17impl CommitExtractionResult {
18    /// Create a new extraction result with the given message.
19    pub fn new(message: String) -> Self {
20        Self(message)
21    }
22
23    /// Convert into the inner message string with final escape sequence cleanup.
24    ///
25    /// This applies the final rendering step to ensure no escape sequences leak through
26    /// to the actual commit message.
27    pub fn into_message(self) -> String {
28        render_final_commit_message(&self.0)
29    }
30}
31
32/// Try to extract commit message from XML format with detailed tracing.
33///
34/// This function uses flexible XML extraction to handle various AI embedding patterns:
35/// - Direct XML tags at content start
36/// - XML in markdown code fences (```xml, ```)
37/// - XML in JSON strings (escaped)
38/// - XML embedded within analysis text
39///
40/// The XML format is preferred because:
41/// - No escape sequence issues (actual newlines work fine)
42/// - Distinctive tags unlikely to appear in LLM analysis text
43/// - Clear boundaries for parsing
44///
45/// # Expected Format
46///
47/// ```xml
48/// <ralph-commit>
49/// <ralph-subject>type(scope): description</ralph-subject>
50/// <ralph-body>Optional body text here.
51/// Can span multiple lines.</ralph-body>
52/// </ralph-commit>
53/// ```
54///
55/// Or with detailed body elements:
56///
57/// ```xml
58/// <ralph-commit>
59/// <ralph-subject>type(scope): description</ralph-subject>
60/// <ralph-body-summary>Brief summary</ralph-body-summary>
61/// <ralph-body-details>Detailed bullet points</ralph-body-details>
62/// <ralph-body-footer>BREAKING CHANGE or Fixes #123</ralph-body-footer>
63/// </ralph-commit>
64/// ```
65///
66/// The `<ralph-body>` tag is optional and may be omitted for commits without a body.
67///
68/// # Returns
69///
70/// A tuple of `(Option<String>, String)`:
71/// - First element: `Some(message)` if valid XML with a valid conventional commit subject was found, `None` otherwise
72/// - Second element: Detailed reason string explaining what was found/not found (for debugging)
73pub fn try_extract_xml_commit_with_trace(content: &str) -> (Option<String>, String) {
74    // Try flexible XML extraction that handles various AI embedding patterns.
75    // If extraction fails, use the raw content directly - XSD validation will
76    // provide a clear error message explaining what's wrong (e.g., missing
77    // <ralph-commit> root element) that can be sent back to the AI for retry.
78    let (xml_block, extraction_pattern) = match extract_xml_commit(content) {
79        Some(xml) => {
80            // Detect which extraction pattern was used for logging
81            let pattern = if content.trim().starts_with("<ralph-commit>") {
82                "direct XML"
83            } else if content.contains("```xml") || content.contains("```\n<ralph-commit>") {
84                "markdown code fence"
85            } else if content.contains("{\"result\":") || content.contains("\"result\":") {
86                "JSON string"
87            } else {
88                "embedded search"
89            };
90            (xml, pattern)
91        }
92        None => {
93            // No XML tags found - use raw content and let XSD validation
94            // produce an informative error for the AI to retry
95            (content.to_string(), "raw content (no XML tags found)")
96        }
97    };
98
99    // Run XSD validation - this will catch both malformed XML and missing elements
100    let xsd_result = validate_xml_against_xsd(&xml_block);
101
102    let message = match xsd_result {
103        Ok(elements) => {
104            // Format the commit message using parsed elements
105            let body = elements.format_body();
106            if body.is_empty() {
107                elements.subject.clone()
108            } else {
109                format!("{}\n\n{}", elements.subject, body)
110            }
111        }
112        Err(e) => {
113            // XSD validation failed - return error with details for AI retry
114            let error_msg = e.format_for_ai_retry();
115            return (None, format!("XSD validation failed: {}", error_msg));
116        }
117    };
118
119    // Determine body presence for logging
120    let has_body = message.lines().count() > 1;
121
122    // Use character-based truncation for UTF-8 safety
123    let message_preview = {
124        let escaped = message.replace('\n', "\\n");
125        truncate_text(&escaped, 83) // ~80 chars + "..."
126    };
127
128    (
129        Some(message.clone()),
130        format!(
131            "Found <ralph-commit> via {}, XSD validation passed, body={}, message: '{}'",
132            extraction_pattern,
133            if has_body { "present" } else { "absent" },
134            message_preview
135        ),
136    )
137}
138
/// Check if a string is a valid conventional commit subject line.
///
/// The subject must start with `type[(scope)][!]:` where:
/// - `type` is one of `feat`, `fix`, `docs`, `style`, `refactor`, `perf`, `test`,
///   `build`, `ci`, `chore` (case-sensitive),
/// - the optional scope is non-empty, enclosed in parentheses, and ends at the
///   first `)` (it may contain colons, e.g. `refactor(llm::extract): ...`),
/// - the optional `!` marks a breaking change and comes directly before the colon.
///
/// Nothing else may appear between the type and the colon, so malformed prefixes
/// such as `feat(unclosed: x` or `fix!oops: x` are rejected. The text after the
/// colon (the description) is not inspected.
///
/// # Returns
///
/// `true` when the prefix is well formed, `false` otherwise (including when the
/// subject has no colon at all).
pub fn is_conventional_commit_subject(subject: &str) -> bool {
    const VALID_TYPES: [&str; 10] = [
        "feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore",
    ];

    // The type is everything before the first scope opener, breaking marker or colon.
    let Some(type_end) = subject.find(|c: char| matches!(c, '(' | '!' | ':')) else {
        return false;
    };
    let (commit_type, rest) = subject.split_at(type_end);
    if !VALID_TYPES.contains(&commit_type) {
        return false;
    }

    // Optional "(scope)": must be closed and non-empty.
    let rest = match rest.strip_prefix('(') {
        Some(after_open) => match after_open.split_once(')') {
            Some((scope, after_scope)) if !scope.is_empty() => after_scope,
            _ => return false,
        },
        None => rest,
    };

    // Optional breaking-change marker, then the mandatory colon.
    let rest = rest.strip_prefix('!').unwrap_or(rest);
    rest.starts_with(':')
}
160
// =========================================================================
// Final Commit Message Rendering
// =========================================================================
164
165/// Render the final commit message with all cleanup applied.
166///
167/// This is the final step before returning a commit message for use in git commit.
168/// It applies:
169/// 1. Escape sequence cleanup (aggressive unescaping)
170/// 2. Final whitespace cleanup
171///
172/// # Arguments
173///
174/// * `message` - The commit message to render
175///
176/// # Returns
177///
178/// The fully rendered commit message with all escape sequences properly handled.
179pub fn render_final_commit_message(message: &str) -> String {
180    let mut result = message.to_string();
181
182    // Step 1: Apply final escape sequence cleanup
183    // This handles any escape sequences that leaked through the pipeline
184    result = final_escape_sequence_cleanup(&result);
185
186    // Step 2: Try aggressive unescaping if there are still escape sequences
187    if result.contains("\\n") || result.contains("\\t") || result.contains("\\r") {
188        result = unescape_json_strings_aggressive(&result);
189    }
190
191    // Step 3: Final whitespace cleanup
192    result = result
193        .lines()
194        .map(str::trim)
195        .filter(|l| !l.is_empty())
196        .collect::<Vec<_>>()
197        .join("\n");
198
199    result
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the extractor and keep only the extracted message (drop the trace).
    fn extracted(content: &str) -> Option<String> {
        try_extract_xml_commit_with_trace(content).0
    }

    /// Assert that rendering `input` yields exactly `expected`.
    fn assert_rendered(input: &str, expected: &str) {
        assert_eq!(render_final_commit_message(input), expected);
    }

    // =========================================================================
    // Tests for CommitExtractionResult
    // =========================================================================

    #[test]
    fn test_commit_extraction_result_into_message() {
        let wrapped = CommitExtractionResult::new("feat: add feature".to_string());
        assert_eq!(wrapped.into_message(), "feat: add feature");
    }

    // =========================================================================
    // Tests for render_final_commit_message
    // =========================================================================

    #[test]
    fn test_render_final_commit_message_with_literal_escapes() {
        // Literal escape sequences are cleaned up; blank lines are then removed.
        assert_rendered(
            "feat: add feature\n\\n\\nBody with literal escapes",
            "feat: add feature\nBody with literal escapes",
        );
    }

    #[test]
    fn test_render_final_commit_message_already_clean() {
        // Clean input only goes through the whitespace cleanup.
        assert_rendered(
            "feat: add feature\n\nBody text here",
            "feat: add feature\nBody text here",
        );
    }

    #[test]
    fn test_render_final_commit_message_with_tabs() {
        // Tab escapes are unescaped, then stripped because every line is trimmed.
        assert_rendered(
            "feat: add feature\\n\\t- item 1\\n\\t- item 2",
            "feat: add feature\n- item 1\n- item 2",
        );
    }

    #[test]
    fn test_render_final_commit_message_with_carriage_returns() {
        // Carriage return escapes are converted; extra blank lines are removed.
        assert_rendered(
            "feat: add feature\\r\\nBody text",
            "feat: add feature\nBody text",
        );
    }

    #[test]
    fn test_render_final_commit_message_whitespace_cleanup() {
        // Trailing empty and whitespace-only lines are removed.
        assert_rendered(
            "feat: add feature\n\nBody text\n\n\n  \n  ",
            "feat: add feature\nBody text",
        );
    }

    #[test]
    fn test_render_final_commit_message_mixed_escape_sequences() {
        // Carriage returns normalized to newlines, tabs trimmed, blank lines removed.
        assert_rendered(
            "feat: add feature\\n\\nDetails:\\r\\n\\t- item 1\\n\\t- item 2",
            "feat: add feature\nDetails:\n- item 1\n- item 2",
        );
    }

    // =========================================================================
    // Tests for is_conventional_commit_subject
    // =========================================================================

    #[test]
    fn test_conventional_commit_subject_valid() {
        let valid_subjects = [
            "feat: add feature",
            "fix: resolve bug",
            "docs: update readme",
            "refactor(core): simplify logic",
            "feat!: breaking change",
            "fix(api)!: breaking fix",
        ];
        for subject in valid_subjects {
            assert!(is_conventional_commit_subject(subject));
        }
    }

    #[test]
    fn test_conventional_commit_subject_invalid() {
        let invalid_subjects = [
            "invalid: not a type",
            "no colon here",
            "",
            "Feature: capitalize",
        ];
        for subject in invalid_subjects {
            assert!(!is_conventional_commit_subject(subject));
        }
    }

    // =========================================================================
    // Tests for XML extraction (try_extract_xml_commit_with_trace)
    // =========================================================================

    #[test]
    fn test_xml_extract_basic_subject_only() {
        // Subject only, no body element.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>feat: add new feature</ralph-subject>\n",
            "</ralph-commit>",
        );
        let (result, reason) = try_extract_xml_commit_with_trace(content);
        let Some(msg) = result else {
            panic!("Should extract from basic XML. Reason: {}", reason);
        };
        assert_eq!(msg, "feat: add new feature");
    }

    #[test]
    fn test_xml_extract_with_body() {
        // Subject plus a two-line body.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>feat(auth): add OAuth2 login flow</ralph-subject>\n",
            "<ralph-body>Implement Google and GitHub OAuth providers.\n",
            "Add session management for OAuth tokens.</ralph-body>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should extract from XML with body");
        assert!(msg.starts_with("feat(auth): add OAuth2 login flow"));
        assert!(msg.contains("Implement Google and GitHub OAuth providers"));
        assert!(msg.contains("Add session management"));
    }

    #[test]
    fn test_xml_extract_with_empty_body() {
        // Empty body tags are treated as "no body".
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>fix: resolve bug</ralph-subject>\n",
            "<ralph-body></ralph-body>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should extract even with empty body");
        assert_eq!(msg, "fix: resolve bug");
    }

    #[test]
    fn test_xml_extract_ignores_preamble() {
        // Text before and after <ralph-commit> is ignored.
        let content = concat!(
            "Here is the commit message based on my analysis:\n",
            "\n",
            "Looking at the diff, I can see...\n",
            "\n",
            "<ralph-commit>\n",
            "<ralph-subject>refactor: simplify logic</ralph-subject>\n",
            "</ralph-commit>\n",
            "\n",
            "That's all!",
        );
        let msg = extracted(content).expect("Should ignore preamble and extract XML");
        assert_eq!(msg, "refactor: simplify logic");
    }

    #[test]
    fn test_xml_extract_fails_missing_tags() {
        // Plain text without any XML tags must not yield a message.
        let outcome = extracted("Just some text without XML tags");
        assert!(outcome.is_none(), "Should fail when XML tags are missing");
    }

    #[test]
    fn test_xml_extract_fails_invalid_commit_type() {
        // Unknown conventional commit types are rejected.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>invalid: not a real type</ralph-subject>\n",
            "</ralph-commit>",
        );
        assert!(
            extracted(content).is_none(),
            "Should reject invalid commit type"
        );
    }

    #[test]
    fn test_xml_extract_fails_missing_subject() {
        // A body without a subject is rejected.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-body>Just a body, no subject</ralph-body>\n",
            "</ralph-commit>",
        );
        assert!(
            extracted(content).is_none(),
            "Should fail when subject is missing"
        );
    }

    #[test]
    fn test_xml_extract_fails_empty_subject() {
        // An empty subject element is rejected.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject></ralph-subject>\n",
            "</ralph-commit>",
        );
        assert!(
            extracted(content).is_none(),
            "Should fail when subject is empty"
        );
    }

    #[test]
    fn test_xml_extract_handles_whitespace_in_subject() {
        // Whitespace around the subject is trimmed.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>   docs: update readme   </ralph-subject>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should handle whitespace in subject");
        assert_eq!(msg, "docs: update readme");
    }

    #[test]
    fn test_xml_extract_with_breaking_change() {
        // The `!` breaking-change marker is accepted and preserved.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>feat!: drop Python 3.7 support</ralph-subject>\n",
            "<ralph-body>BREAKING CHANGE: Minimum Python version is now 3.8.</ralph-body>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should handle breaking change indicator");
        assert!(msg.starts_with("feat!:"));
        assert!(msg.contains("BREAKING CHANGE"));
    }

    #[test]
    fn test_xml_extract_with_scope() {
        // A scoped subject is returned unchanged.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>test(parser): add coverage for edge cases</ralph-subject>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should handle scope in subject");
        assert_eq!(msg, "test(parser): add coverage for edge cases");
    }

    #[test]
    fn test_xml_extract_body_preserves_newlines() {
        // Newlines inside the body survive extraction.
        let content = concat!(
            "<ralph-commit>\n",
            "<ralph-subject>feat: add feature</ralph-subject>\n",
            "<ralph-body>Line 1\n",
            "Line 2\n",
            "Line 3</ralph-body>\n",
            "</ralph-commit>",
        );
        let msg = extracted(content).expect("Should preserve newlines in body");
        assert!(msg.contains("Line 1\nLine 2\nLine 3"));
    }

    #[test]
    fn test_xml_extract_fails_malformed_tags() {
        // Closing tag before the opening tag is malformed.
        let content = concat!(
            "</ralph-commit>\n",
            "<ralph-subject>feat: add feature</ralph-subject>\n",
            "<ralph-commit>",
        );
        assert!(
            extracted(content).is_none(),
            "Should fail for malformed tags"
        );
    }

    #[test]
    fn test_xml_extract_handles_markdown_code_fence() {
        // The tags are still present inside the fence, so extraction should work.
        let content = concat!(
            "```xml\n",
            "<ralph-commit>\n",
            "<ralph-subject>feat: add feature</ralph-subject>\n",
            "</ralph-commit>\n",
            "```",
        );
        assert!(
            extracted(content).is_some(),
            "Should extract from XML even inside code fence"
        );
    }

    #[test]
    fn test_xml_extract_with_thinking_preamble() {
        // A "thinking" preamble before the XML is ignored.
        let log_content = concat!(
            "[Claude] Thinking: Looking at this diff, I need to analyze...\n",
            "\n",
            "<ralph-commit>\n",
            "<ralph-subject>feat(pipeline): add recovery mechanism</ralph-subject>\n",
            "<ralph-body>When commit validation fails, attempt to salvage valid message.</ralph-body>\n",
            "</ralph-commit>",
        );

        let (result, _reason) = try_extract_xml_commit_with_trace(log_content);
        assert!(result.is_some());
        let msg = result.unwrap();
        assert!(msg.starts_with("feat(pipeline):"));
    }

    // Exercises the XSD validation path that runs inside the extraction function.
    #[test]
    fn test_xsd_validation_integrated_in_extraction() {
        let xml = concat!(
            "Some text before\n",
            "<ralph-commit>\n",
            "<ralph-subject>fix: resolve bug</ralph-subject>\n",
            "</ralph-commit>\n",
            "Some text after",
        );
        let (msg, trace) = try_extract_xml_commit_with_trace(xml);
        assert!(msg.is_some(), "Should extract valid message");
        // The trace should contain the XSD validation result.
        assert!(trace.contains("XSD"), "Trace should mention XSD validation");
    }
}