Skip to main content

ralph_workflow/files/llm_output_extraction/
commit.rs

//! Commit Message Extraction Functions
//!
//! This module provides utilities for extracting commit messages from AI agent output
//! using XML format with XSD validation.
5
6use super::cleaning::{final_escape_sequence_cleanup, unescape_json_strings_aggressive};
7use super::xml_extraction::extract_xml_commit;
8use super::xsd_validation::validate_xml_against_xsd;
9use crate::common::truncate_text;
10
11/// Result of commit message extraction.
12///
13/// This struct wraps a successfully extracted commit message.
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct CommitExtractionResult(String);
16
17impl CommitExtractionResult {
18    /// Create a new extraction result with the given message.
19    pub fn new(message: String) -> Self {
20        Self(message)
21    }
22
23    /// Convert into the inner message string with final escape sequence cleanup.
24    ///
25    /// This applies the final rendering step to ensure no escape sequences leak through
26    /// to the actual commit message.
27    pub fn into_message(self) -> String {
28        render_final_commit_message(&self.0)
29    }
30}
31
32/// Try to extract a commit message from the XML format, with a trace string for debugging.
33///
34/// This uses flexible XML extraction (direct tags, fenced blocks, escaped JSON strings, embedded
35/// text) and validates the resulting XML against the commit XSD.
36pub fn try_extract_xml_commit_with_trace(content: &str) -> (Option<String>, String) {
37    // Try flexible XML extraction that handles various AI embedding patterns.
38    // If extraction fails, use the raw content directly - XSD validation will
39    // provide a clear error message explaining what's wrong (e.g., missing
40    // <ralph-commit> root element) that can be sent back to the AI for retry.
41    let (xml_block, extraction_pattern) = match extract_xml_commit(content) {
42        Some(xml) => {
43            // Detect which extraction pattern was used for logging
44            let pattern = if content.trim().starts_with("<ralph-commit>") {
45                "direct XML"
46            } else if content.contains("```xml") || content.contains("```\n<ralph-commit>") {
47                "markdown code fence"
48            } else if content.contains("{\"result\":") || content.contains("\"result\":") {
49                "JSON string"
50            } else {
51                "embedded search"
52            };
53            (xml, pattern)
54        }
55        None => {
56            // No XML tags found - use raw content and let XSD validation
57            // produce an informative error for the AI to retry
58            (content.to_string(), "raw content (no XML tags found)")
59        }
60    };
61
62    // Run XSD validation - this will catch both malformed XML and missing elements
63    let xsd_result = validate_xml_against_xsd(&xml_block);
64
65    let message = match xsd_result {
66        Ok(elements) => {
67            // Format the commit message using parsed elements
68            let body = elements.format_body();
69            if body.is_empty() {
70                elements.subject.clone()
71            } else {
72                format!("{}\n\n{}", elements.subject, body)
73            }
74        }
75        Err(e) => {
76            // XSD validation failed - return error with details for AI retry
77            let error_msg = e.format_for_ai_retry();
78            return (None, format!("XSD validation failed: {}", error_msg));
79        }
80    };
81
82    // Determine body presence for logging
83    let has_body = message.lines().count() > 1;
84
85    // Use character-based truncation for UTF-8 safety
86    let message_preview = {
87        let escaped = message.replace('\n', "\\n");
88        truncate_text(&escaped, 83) // ~80 chars + "..."
89    };
90
91    (
92        Some(message.clone()),
93        format!(
94            "Found <ralph-commit> via {}, XSD validation passed, body={}, message: '{}'",
95            extraction_pattern,
96            if has_body { "present" } else { "absent" },
97            message_preview
98        ),
99    )
100}
101
/// Check if a string is a valid conventional commit subject line.
///
/// The subject must begin with a prefix of the form `type[(scope)][!]:` where
/// `type` is one of the accepted lowercase commit types. The check covers the
/// prefix only; the description after the colon is not inspected.
///
/// Malformed prefixes are rejected, for example an unclosed scope
/// (`feat(core: x`), an empty scope (`feat(): x`), text between the scope and the
/// colon (`feat(core)x: y`), or a repeated breaking marker (`feat!!: x`).
///
/// # Arguments
///
/// * `subject` - The first line of a commit message.
///
/// # Returns
///
/// `true` when the prefix is well-formed and uses an accepted type.
pub fn is_conventional_commit_subject(subject: &str) -> bool {
    const VALID_TYPES: [&str; 10] = [
        "feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore",
    ];

    // The type is everything up to the first scope opener, breaking marker, or colon.
    // A subject with none of these cannot contain the required colon at all.
    let Some(type_end) = subject.find(|c: char| matches!(c, '(' | '!' | ':')) else {
        return false;
    };
    let (commit_type, rest) = subject.split_at(type_end);
    if !VALID_TYPES.contains(&commit_type) {
        return false;
    }

    // Optional scope: must be closed and non-empty.
    let rest = match rest.strip_prefix('(') {
        Some(after_open) => {
            let Some((scope, after_scope)) = after_open.split_once(')') else {
                return false;
            };
            if scope.is_empty() {
                return false;
            }
            after_scope
        }
        None => rest,
    };

    // Optional single breaking-change marker, then the mandatory colon.
    let rest = rest.strip_prefix('!').unwrap_or(rest);
    rest.starts_with(':')
}
123
// =========================================================================
// Final Commit Message Rendering
// =========================================================================
127
128/// Render the final commit message with all cleanup applied.
129///
130/// This is the final step before returning a commit message for use in git commit.
131/// It applies:
132/// 1. Escape sequence cleanup (aggressive unescaping)
133/// 2. Final whitespace cleanup
134///
135/// # Arguments
136///
137/// * `message` - The commit message to render
138///
139/// # Returns
140///
141/// The fully rendered commit message with all escape sequences properly handled.
142pub fn render_final_commit_message(message: &str) -> String {
143    let mut result = message.to_string();
144
145    // Step 1: Apply final escape sequence cleanup
146    // This handles any escape sequences that leaked through the pipeline
147    result = final_escape_sequence_cleanup(&result);
148
149    // Step 2: Try aggressive unescaping if there are still escape sequences
150    if result.contains("\\n") || result.contains("\\t") || result.contains("\\r") {
151        result = unescape_json_strings_aggressive(&result);
152    }
153
154    // Step 3: Final whitespace cleanup
155    result = result
156        .lines()
157        .map(str::trim)
158        .filter(|l| !l.is_empty())
159        .collect::<Vec<_>>()
160        .join("\n");
161
162    result
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the extractor and keep only the message half of its result.
    fn extract_message(content: &str) -> Option<String> {
        try_extract_xml_commit_with_trace(content).0
    }

    // -------------------------------------------------------------------------
    // CommitExtractionResult
    // -------------------------------------------------------------------------

    #[test]
    fn test_commit_extraction_result_into_message() {
        let wrapped = CommitExtractionResult::new(String::from("feat: add feature"));
        assert_eq!(wrapped.into_message(), "feat: add feature");
    }

    // -------------------------------------------------------------------------
    // render_final_commit_message
    // -------------------------------------------------------------------------

    #[test]
    fn test_render_final_commit_message_with_literal_escapes() {
        // Literal backslash-n sequences are cleaned up; the blank lines they
        // would produce are dropped by the whitespace pass.
        let rendered =
            render_final_commit_message("feat: add feature\n\\n\\nBody with literal escapes");
        assert_eq!(rendered, "feat: add feature\nBody with literal escapes");
    }

    #[test]
    fn test_render_final_commit_message_already_clean() {
        // A clean message only loses its blank separator line.
        let rendered = render_final_commit_message("feat: add feature\n\nBody text here");
        assert_eq!(rendered, "feat: add feature\nBody text here");
    }

    #[test]
    fn test_render_final_commit_message_with_tabs() {
        // Escaped tabs become real tabs and are then removed by per-line trimming.
        let rendered =
            render_final_commit_message("feat: add feature\\n\\t- item 1\\n\\t- item 2");
        assert_eq!(rendered, "feat: add feature\n- item 1\n- item 2");
    }

    #[test]
    fn test_render_final_commit_message_with_carriage_returns() {
        // An escaped CR/LF pair ends up as a single line break.
        let rendered = render_final_commit_message("feat: add feature\\r\\nBody text");
        assert_eq!(rendered, "feat: add feature\nBody text");
    }

    #[test]
    fn test_render_final_commit_message_whitespace_cleanup() {
        // Trailing blank and whitespace-only lines disappear.
        let rendered = render_final_commit_message("feat: add feature\n\nBody text\n\n\n  \n  ");
        assert_eq!(rendered, "feat: add feature\nBody text");
    }

    #[test]
    fn test_render_final_commit_message_mixed_escape_sequences() {
        // Newline, carriage-return and tab escapes combined in one message.
        let rendered = render_final_commit_message(
            "feat: add feature\\n\\nDetails:\\r\\n\\t- item 1\\n\\t- item 2",
        );
        assert_eq!(rendered, "feat: add feature\nDetails:\n- item 1\n- item 2");
    }

    // -------------------------------------------------------------------------
    // is_conventional_commit_subject
    // -------------------------------------------------------------------------

    #[test]
    fn test_conventional_commit_subject_valid() {
        let accepted = [
            "feat: add feature",
            "fix: resolve bug",
            "docs: update readme",
            "refactor(core): simplify logic",
            "feat!: breaking change",
            "fix(api)!: breaking fix",
        ];
        for subject in accepted {
            assert!(is_conventional_commit_subject(subject));
        }
    }

    #[test]
    fn test_conventional_commit_subject_invalid() {
        let rejected = [
            "invalid: not a type",
            "no colon here",
            "",
            "Feature: capitalize",
        ];
        for subject in rejected {
            assert!(!is_conventional_commit_subject(subject));
        }
    }

    // -------------------------------------------------------------------------
    // try_extract_xml_commit_with_trace
    // -------------------------------------------------------------------------

    #[test]
    fn test_xml_extract_basic_subject_only() {
        // Smallest valid document: a subject and nothing else.
        let xml = r"<ralph-commit>
<ralph-subject>feat: add new feature</ralph-subject>
</ralph-commit>";
        let (extracted, reason) = try_extract_xml_commit_with_trace(xml);
        assert!(
            extracted.is_some(),
            "Should extract from basic XML. Reason: {}",
            reason
        );
        assert_eq!(extracted.unwrap(), "feat: add new feature");
    }

    #[test]
    fn test_xml_extract_with_body() {
        // Subject plus a two-line body.
        let xml = r"<ralph-commit>
<ralph-subject>feat(auth): add OAuth2 login flow</ralph-subject>
<ralph-body>Implement Google and GitHub OAuth providers.
Add session management for OAuth tokens.</ralph-body>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should extract from XML with body");
        let message = extracted.unwrap();
        assert!(message.starts_with("feat(auth): add OAuth2 login flow"));
        assert!(message.contains("Implement Google and GitHub OAuth providers"));
        assert!(message.contains("Add session management"));
    }

    #[test]
    fn test_xml_extract_with_empty_body() {
        // Empty body tags must behave like a missing body.
        let xml = r"<ralph-commit>
<ralph-subject>fix: resolve bug</ralph-subject>
<ralph-body></ralph-body>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should extract even with empty body");
        assert_eq!(extracted.unwrap(), "fix: resolve bug");
    }

    #[test]
    fn test_xml_extract_ignores_preamble() {
        // Chatter before and after the document is not part of the message.
        let output = r"Here is the commit message based on my analysis:

Looking at the diff, I can see...

<ralph-commit>
<ralph-subject>refactor: simplify logic</ralph-subject>
</ralph-commit>

That's all!";
        let extracted = extract_message(output);
        assert!(extracted.is_some(), "Should ignore preamble and extract XML");
        assert_eq!(extracted.unwrap(), "refactor: simplify logic");
    }

    #[test]
    fn test_xml_extract_fails_missing_tags() {
        // Plain text with no XML at all.
        let extracted = extract_message("Just some text without XML tags");
        assert!(extracted.is_none(), "Should fail when XML tags are missing");
    }

    #[test]
    fn test_xml_extract_fails_invalid_commit_type() {
        // The subject's type is not one of the accepted conventional types.
        let xml = r"<ralph-commit>
<ralph-subject>invalid: not a real type</ralph-subject>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_none(), "Should reject invalid commit type");
    }

    #[test]
    fn test_xml_extract_fails_missing_subject() {
        // A body without a subject is not a valid document.
        let xml = r"<ralph-commit>
<ralph-body>Just a body, no subject</ralph-body>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_none(), "Should fail when subject is missing");
    }

    #[test]
    fn test_xml_extract_fails_empty_subject() {
        // The subject element exists but has no text.
        let xml = r"<ralph-commit>
<ralph-subject></ralph-subject>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_none(), "Should fail when subject is empty");
    }

    #[test]
    fn test_xml_extract_handles_whitespace_in_subject() {
        // Padding around the subject text is trimmed away.
        let xml = r"<ralph-commit>
<ralph-subject>   docs: update readme   </ralph-subject>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should handle whitespace in subject");
        assert_eq!(extracted.unwrap(), "docs: update readme");
    }

    #[test]
    fn test_xml_extract_with_breaking_change() {
        // `!` marker in the subject plus a BREAKING CHANGE note in the body.
        let xml = r"<ralph-commit>
<ralph-subject>feat!: drop Python 3.7 support</ralph-subject>
<ralph-body>BREAKING CHANGE: Minimum Python version is now 3.8.</ralph-body>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should handle breaking change indicator");
        let message = extracted.unwrap();
        assert!(message.starts_with("feat!:"));
        assert!(message.contains("BREAKING CHANGE"));
    }

    #[test]
    fn test_xml_extract_with_scope() {
        // A parenthesised scope after the type is accepted.
        let xml = r"<ralph-commit>
<ralph-subject>test(parser): add coverage for edge cases</ralph-subject>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should handle scope in subject");
        assert_eq!(
            extracted.unwrap(),
            "test(parser): add coverage for edge cases"
        );
    }

    #[test]
    fn test_xml_extract_body_preserves_newlines() {
        // Line breaks inside the body survive extraction.
        let xml = r"<ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
<ralph-body>Line 1
Line 2
Line 3</ralph-body>
</ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_some(), "Should preserve newlines in body");
        let message = extracted.unwrap();
        assert!(message.contains("Line 1\nLine 2\nLine 3"));
    }

    #[test]
    fn test_xml_extract_fails_malformed_tags() {
        // Closing root tag appears before the opening one.
        let xml = r"</ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
<ralph-commit>";
        let extracted = extract_message(xml);
        assert!(extracted.is_none(), "Should fail for malformed tags");
    }

    #[test]
    fn test_xml_extract_handles_markdown_code_fence() {
        // The document is wrapped in a fenced ```xml block; the tags are still
        // present in the content, so extraction is expected to succeed.
        let fenced = r"```xml
<ralph-commit>
<ralph-subject>feat: add feature</ralph-subject>
</ralph-commit>
```";
        let extracted = extract_message(fenced);
        assert!(
            extracted.is_some(),
            "Should extract from XML even inside code fence"
        );
    }

    #[test]
    fn test_xml_extract_with_thinking_preamble() {
        // A logged "thinking" line precedes the document and must be skipped.
        let log_output = r"[Claude] Thinking: Looking at this diff, I need to analyze...

<ralph-commit>
<ralph-subject>feat(pipeline): add recovery mechanism</ralph-subject>
<ralph-body>When commit validation fails, attempt to salvage valid message.</ralph-body>
</ralph-commit>";

        let (extracted, _trace) = try_extract_xml_commit_with_trace(log_output);
        assert!(extracted.is_some());
        let message = extracted.unwrap();
        assert!(message.starts_with("feat(pipeline):"));
    }

    #[test]
    fn test_xsd_validation_integrated_in_extraction() {
        // Validation runs inside try_extract_xml_commit_with_trace; this checks
        // that the trace reports it for a document surrounded by other text.
        let surrounded = r"Some text before
<ralph-commit>
<ralph-subject>fix: resolve bug</ralph-subject>
</ralph-commit>
Some text after";
        let (extracted, trace) = try_extract_xml_commit_with_trace(surrounded);
        assert!(extracted.is_some(), "Should extract valid message");
        assert!(trace.contains("XSD"), "Trace should mention XSD validation");
    }
}