ralph_workflow/files/llm_output_extraction/
commit.rs

1//! Commit Message Validation and Recovery Functions
2//!
3//! This module provides utilities for validating commit messages, salvaging
4//! valid messages from mixed output, and generating fallback messages from
5//! git diff metadata.
6
7use regex::Regex;
8use serde::Deserialize;
9use std::sync::OnceLock;
10
11use super::cleaning::{
12    final_escape_sequence_cleanup, find_conventional_commit_start, looks_like_analysis_text,
13    unescape_json_strings, unescape_json_strings_aggressive,
14};
15use crate::agents::AgentErrorKind;
16
17/// Detect if agent output contains unrecoverable errors that should trigger fallback.
18///
19/// This handles cases where agents output errors in their result field instead of stderr.
20/// Patterns include: "Prompt is too long", "token limit exceeded", etc.
21///
22/// Enhanced with more patterns to catch additional error variations from different agents.
23pub fn detect_agent_errors_in_output(content: &str) -> Option<AgentErrorKind> {
24    let content_lower = content.to_lowercase();
25
26    // Check for token/context exhaustion patterns in output
27    // These patterns indicate the prompt was too large for the agent's context window
28    if content_lower.contains("prompt is too long")
29        || content_lower.contains("token limit exceeded")
30        || content_lower.contains("context length exceeded")
31        || content_lower.contains("maximum context")
32        || content_lower.contains("input too large")
33        || content_lower.contains("context window")
34        || content_lower.contains("max tokens")
35        || content_lower.contains("token limit")
36        || content_lower.contains("too many tokens")
37        || content_lower.contains("exceeds context")
38        || content_lower.contains("model's context length")
39        || content_lower.contains("input exceeds")
40    {
41        return Some(AgentErrorKind::TokenExhausted);
42    }
43
44    // Check for agent failure patterns
45    // These indicate other types of agent errors (API issues, invalid requests, etc.)
46    if content_lower.contains("invalid request")
47        || content_lower.contains("request failed")
48        || content_lower.contains("api error")
49        || content_lower.contains("rate limit")
50        || content_lower.contains("service unavailable")
51    {
52        return Some(AgentErrorKind::InvalidResponse);
53    }
54
55    None
56}
57
58/// Result of commit message extraction with detail about the extraction method.
59///
60/// This enum allows callers to distinguish between different extraction outcomes
61/// and take appropriate action (e.g., re-prompt when receiving a Fallback result).
62#[derive(Debug, Clone, PartialEq, Eq)]
63pub enum CommitExtractionResult {
64    /// Successfully extracted from structured agent output (JSON schema or pattern-based)
65    Extracted(String),
66    /// Recovered via salvage mechanism (found conventional commit within mixed output)
67    Salvaged(String),
68    /// Using deterministic fallback generated from diff metadata
69    Fallback(String),
70    /// Agent error detected in output (should trigger fallback)
71    AgentError(AgentErrorKind),
72}
73
74impl CommitExtractionResult {
75    /// Convert into the inner message string with final escape sequence cleanup.
76    ///
77    /// This applies the final rendering step to ensure no escape sequences leak through
78    /// to the actual commit message.
79    pub fn into_message(self) -> String {
80        match self {
81            Self::Extracted(msg) | Self::Salvaged(msg) | Self::Fallback(msg) => {
82                render_final_commit_message(&msg)
83            }
84            Self::AgentError(_) => String::new(), // Errors produce empty message
85        }
86    }
87
88    /// Check if this was a fallback result (should trigger re-prompt).
89    pub const fn is_fallback(&self) -> bool {
90        matches!(self, Self::Fallback(_))
91    }
92
93    /// Check if this was an agent error (should trigger fallback immediately).
94    pub const fn is_agent_error(&self) -> bool {
95        matches!(self, Self::AgentError(_))
96    }
97
98    /// Get the error kind if this is an agent error.
99    pub const fn error_kind(&self) -> Option<AgentErrorKind> {
100        match self {
101            Self::AgentError(kind) => Some(*kind),
102            _ => None,
103        }
104    }
105}
106
107/// Structured commit message schema for JSON parsing.
108#[derive(Debug, Deserialize)]
109struct StructuredCommitMessage {
110    subject: String,
111    body: Option<String>,
112}
113
114/// Try to extract commit message from XML format with detailed tracing.
115///
116/// This function looks for the distinctive `<ralph-commit>` tags used by our
117/// XML-based commit prompt template. The XML format is preferred because:
118/// - No escape sequence issues (actual newlines work fine)
119/// - Distinctive tags unlikely to appear in LLM analysis text
120/// - Clear boundaries for parsing
121///
122/// # Expected Format
123///
124/// ```xml
125/// <ralph-commit>
126/// <ralph-subject>type(scope): description</ralph-subject>
127/// <ralph-body>Optional body text here.
128/// Can span multiple lines.</ralph-body>
129/// </ralph-commit>
130/// ```
131///
132/// The `<ralph-body>` tag is optional and may be omitted for commits without a body.
133///
134/// # Returns
135///
136/// A tuple of `(Option<String>, String)`:
137/// - First element: `Some(message)` if valid XML with a valid conventional commit subject was found, `None` otherwise
138/// - Second element: Detailed reason string explaining what was found/not found (for debugging)
139pub fn try_extract_xml_commit_with_trace(content: &str) -> (Option<String>, String) {
140    let content_len = content.len();
141    let content_preview = if content_len > 50 {
142        format!("{}...", &content[..50].replace('\n', "\\n"))
143    } else {
144        content.replace('\n', "\\n")
145    };
146
147    // Find the <ralph-commit> block
148    let Some(commit_start) = content.find("<ralph-commit>") else {
149        return (
150            None,
151            format!(
152                "No <ralph-commit> tag found (content length: {content_len}, starts with: '{content_preview}')"
153            ),
154        );
155    };
156
157    let Some(commit_end) = content.find("</ralph-commit>") else {
158        return (
159            None,
160            format!(
161                "Found <ralph-commit> at pos {commit_start}, but no closing </ralph-commit> tag"
162            ),
163        );
164    };
165
166    if commit_start >= commit_end {
167        return (
168            None,
169            format!(
170                "Malformed XML: </ralph-commit> at {commit_end} appears before <ralph-commit> at {commit_start}"
171            ),
172        );
173    }
174
175    // Extract content between the tags
176    let commit_block = &content[commit_start + "<ralph-commit>".len()..commit_end];
177
178    // Extract subject (required)
179    let Some(subject) = extract_xml_tag_content(commit_block, "ralph-subject") else {
180        return (
181            None,
182            format!(
183                "Found <ralph-commit> at {commit_start}, but <ralph-subject> tag not found within commit block"
184            ),
185        );
186    };
187
188    let subject = subject.trim();
189    if subject.is_empty() {
190        return (
191            None,
192            format!("Found <ralph-subject> but it is empty (at pos {commit_start})"),
193        );
194    }
195
196    // Validate conventional commit format
197    if !is_conventional_commit_subject(subject) {
198        return (
199            None,
200            format!(
201                "Found subject '{}' but it doesn't match conventional commit format (type: ...)",
202                if subject.len() > 50 {
203                    format!("{}...", &subject[..50])
204                } else {
205                    subject.to_string()
206                }
207            ),
208        );
209    }
210
211    // Extract body (optional)
212    let body = extract_xml_tag_content(commit_block, "ralph-body");
213
214    // Format the commit message
215    let has_body = body.as_ref().is_some_and(|b| !b.trim().is_empty());
216    let message = match &body {
217        Some(body_content) if !body_content.trim().is_empty() => {
218            format!("{}\n\n{}", subject, body_content.trim())
219        }
220        _ => subject.to_string(),
221    };
222    (
223        Some(message.clone()),
224        format!(
225            "Found <ralph-commit> at pos {commit_start}, <ralph-subject> extracted, body={}, message: '{}'",
226            if has_body { "present" } else { "absent" },
227            if message.len() > 80 {
228                format!("{}...", &message[..80].replace('\n', "\\n"))
229            } else {
230                message.replace('\n', "\\n")
231            }
232        ),
233    )
234}
235
236/// Try to extract commit message from JSON format with detailed tracing.
237///
238/// Returns both the result and a detailed reason string explaining what was found/not found.
239pub fn try_extract_structured_commit_with_trace(content: &str) -> (Option<String>, String) {
240    let trimmed = content.trim();
241    let content_len = trimmed.len();
242
243    // Check for NDJSON stream
244    if looks_like_ndjson(trimmed) {
245        for line in trimmed.lines() {
246            let line = line.trim();
247            if !line.starts_with('{') {
248                continue;
249            }
250            if let Ok(json) = serde_json::from_str::<serde_json::Value>(line) {
251                if json.get("type").and_then(|v| v.as_str()) == Some("result") {
252                    if let Some(result_str) = json.get("result").and_then(|v| v.as_str()) {
253                        if let Some(msg) = try_extract_from_text(result_str) {
254                            return (
255                                Some(msg.clone()),
256                                format!(
257                                    "Extracted from NDJSON result field, message: '{}'",
258                                    if msg.len() > 80 {
259                                        format!("{}...", &msg[..80].replace('\n', "\\n"))
260                                    } else {
261                                        msg.replace('\n', "\\n")
262                                    }
263                                ),
264                            );
265                        }
266                    }
267                }
268            }
269        }
270        return (
271            None,
272            format!("Content looks like NDJSON ({content_len} chars) but no valid commit found in result field"),
273        );
274    }
275
276    // Try extraction from text content
277    if let Some(msg) = try_extract_from_text(trimmed) {
278        return (
279            Some(msg.clone()),
280            format!(
281                "Extracted from JSON/text content, message: '{}'",
282                if msg.len() > 80 {
283                    format!("{}...", &msg[..80].replace('\n', "\\n"))
284                } else {
285                    msg.replace('\n', "\\n")
286                }
287            ),
288        );
289    }
290
291    // Provide detailed failure reason
292    let has_brace = trimmed.contains('{');
293    let has_subject_key = trimmed.contains("\"subject\"");
294
295    if has_brace && has_subject_key {
296        (
297            None,
298            format!(
299                "Content has JSON-like structure ({content_len} chars, has '{{': {has_brace}, has 'subject' key: {has_subject_key}) but parsing failed"
300            ),
301        )
302    } else if has_brace {
303        (
304            None,
305            format!("Content has '{{' but no 'subject' key found ({content_len} chars)"),
306        )
307    } else {
308        (
309            None,
310            format!("Content does not appear to be JSON ({content_len} chars, no '{{' found)"),
311        )
312    }
313}
314
315/// Extract content between XML-style tags.
316///
317/// # Arguments
318///
319/// * `content` - The content to search
320/// * `tag_name` - The tag name (without angle brackets)
321///
322/// # Returns
323///
324/// * `Some(content)` if the tag was found with content
325/// * `None` if the tag was not found or was empty
326fn extract_xml_tag_content(content: &str, tag_name: &str) -> Option<String> {
327    let open_tag = format!("<{tag_name}>");
328    let close_tag = format!("</{tag_name}>");
329
330    let start = content.find(&open_tag)?;
331    let end = content.find(&close_tag)?;
332
333    if start >= end {
334        return None;
335    }
336
337    let inner = &content[start + open_tag.len()..end];
338
339    if inner.is_empty() {
340        None
341    } else {
342        Some(inner.to_string())
343    }
344}
345
346/// Try to extract a structured commit message from text content.
347///
348/// This handles:
349/// - Direct JSON parsing
350/// - JSON inside markdown code fences
351/// - JSON embedded within other text
352fn try_extract_from_text(content: &str) -> Option<String> {
353    let trimmed = content.trim();
354
355    // Try extracting from markdown code fence first
356    if let Some(json_content) = extract_json_from_code_fence(trimmed) {
357        if let Ok(msg) = serde_json::from_str::<StructuredCommitMessage>(&json_content) {
358            return format_structured_commit(&msg);
359        }
360    }
361
362    // Try direct parse
363    if let Ok(msg) = serde_json::from_str::<StructuredCommitMessage>(trimmed) {
364        return format_structured_commit(&msg);
365    }
366
367    // Try to find JSON object within content (in case of minor preamble)
368    if let Some(start) = trimmed.find('{') {
369        if let Some(end) = trimmed.rfind('}') {
370            if start < end {
371                let json_str = &trimmed[start..=end];
372                if let Ok(msg) = serde_json::from_str::<StructuredCommitMessage>(json_str) {
373                    return format_structured_commit(&msg);
374                }
375            }
376        }
377    }
378
379    None
380}
381
382/// Format a structured commit message into the final string format.
383fn format_structured_commit(msg: &StructuredCommitMessage) -> Option<String> {
384    let subject = msg.subject.trim();
385
386    // Unescape JSON escape sequences that may be in the string values
387    // When serde_json parses JSON like `{"subject": "feat: add", "body": "Line 1\\nLine 2"}`
388    // the body field contains the literal characters `\` and `n`, not a newline.
389    // We need to unescape these to get proper formatting.
390    let subject = unescape_json_strings(subject);
391
392    // Validate conventional commit format
393    if !is_conventional_commit_subject(&subject) {
394        return None;
395    }
396
397    // Format the commit message
398    match &msg.body {
399        Some(body) if !body.trim().is_empty() => {
400            let body = unescape_json_strings(body.trim());
401            Some(format!("{subject}\n\n{body}",))
402        }
403        _ => Some(subject),
404    }
405}
406
407/// Check if a string is a valid conventional commit subject line.
408fn is_conventional_commit_subject(subject: &str) -> bool {
409    let valid_types = [
410        "feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore",
411    ];
412
413    // Find the colon
414    let Some(colon_pos) = subject.find(':') else {
415        return false;
416    };
417
418    let prefix = &subject[..colon_pos];
419
420    // Extract type (before optional scope and !)
421    let type_end = prefix
422        .find('(')
423        .unwrap_or_else(|| prefix.find('!').unwrap_or(prefix.len()));
424    let commit_type = &prefix[..type_end];
425
426    valid_types.contains(&commit_type)
427}
428
429/// Extract JSON content from markdown code fences.
430///
431/// Handles both regular code fences and nested fences (e.g., within NDJSON streams).
432fn extract_json_from_code_fence(content: &str) -> Option<String> {
433    // Look for ```json code fence
434    let fence_start = content.find("```json")?;
435    let after_fence = &content[fence_start + 7..]; // Skip past ```json
436
437    // Find the end of the code fence
438    let fence_end = after_fence.find("\n```")?;
439    let json_content = after_fence[..fence_end].trim();
440
441    if json_content.is_empty() {
442        None
443    } else {
444        Some(json_content.to_string())
445    }
446}
447
448/// Check if content looks like NDJSON stream.
449fn looks_like_ndjson(content: &str) -> bool {
450    content.lines().count() > 1 && content.contains("{\"type\":")
451}
452
453/// File count pattern regex - compiled once using `OnceLock` for efficiency.
454/// Matches patterns like "chore: N file(s) changed" for any number N.
455fn file_count_pattern_regex() -> &'static Regex {
456    static RE: OnceLock<Regex> = OnceLock::new();
457    RE.get_or_init(|| {
458        Regex::new(r"^chore:\s*\d+\s+(?:file\(s\)|files?)\s+changed$")
459            .expect("file count regex should be valid")
460    })
461}
462
463// =========================================================================
464// Commit Message Validation Helper Functions
465// =========================================================================
466
467/// Validate basic length requirements for commit message content.
468fn validate_basic_length(content: &str) -> Result<(), String> {
469    // Check for empty
470    if content.is_empty() {
471        return Err("Commit message is empty".to_string());
472    }
473
474    // Check minimum length
475    if content.len() < 5 {
476        return Err(format!(
477            "Commit message too short ({} chars, minimum 5)",
478            content.len()
479        ));
480    }
481
482    // Check maximum length (Git convention: first line <72, total <1000)
483    if content.len() > 2000 {
484        return Err(format!(
485            "Commit message too long ({} chars, maximum 2000)",
486            content.len()
487        ));
488    }
489
490    Ok(())
491}
492
493/// Validate that content does not contain JSON parsing artifacts.
494fn validate_no_json_artifacts(content: &str) -> Result<(), String> {
495    let json_indicators = [
496        r#"{"type":"#,
497        r#"{"result":"#,
498        r#"{"content":"#,
499        r#"{"subject":"#, // Structured commit message JSON that wasn't parsed
500        r#"{"body":"#,    // Partial structured commit JSON
501        r#""session_id":"#,
502        r#""timestamp":"#,
503        "stream_event",
504        "content_block",
505    ];
506    for indicator in json_indicators {
507        if content.contains(indicator) {
508            return Err(format!(
509                "Commit message contains JSON artifacts: {}...",
510                &indicator[..indicator.len().min(20)]
511            ));
512        }
513    }
514    Ok(())
515}
516
517/// Validate that content does not contain literal escape sequences indicating JSON unescaping failure.
518fn validate_no_literal_escape_sequences(content: &str) -> Result<(), String> {
519    // Pattern 1: Body starts with literal \n\n (most common JSON escaping issue)
520    // After a subject line like "feat: add", the body should start with actual newlines,
521    // not literal "\n\n" characters. This indicates the JSON wasn't properly unescaped.
522    let lines: Vec<&str> = content.lines().collect();
523    if lines.len() >= 2 {
524        let second_line = lines[1].trim();
525        // Check if body starts with literal escape sequences
526        if second_line == "\\n" || second_line == "\\n\\n" || second_line.starts_with("\\n\\n") {
527            return Err(
528                "Commit message body appears to contain literal escape sequences (\\n\\n). \
529                 This indicates JSON was not properly unescaped. \
530                 Expected actual newlines after subject line."
531                    .to_string(),
532            );
533        }
534    }
535
536    // Pattern 2: Check for literal escape sequences COMBINED WITH JSON artifacts
537    // This is a safety check for cases where unescaping failed but only when
538    // combined with other JSON indicators that indicate actual parsing failure.
539    // Individual literal \n, \t, \r without JSON artifacts may be legitimate
540    // content in commit messages (e.g., "fix: handle \\n in filenames")
541    let json_and_escape_patterns = [
542        (r#"{"type":"#, "\\n"),
543        (r#"{"result":"#, "\\n"),
544        (r#"{"content":"#, "\\n"),
545        (r#""session_id":"#, "\\n"),
546    ];
547    for (json_pattern, escape_pattern) in json_and_escape_patterns {
548        if content.contains(json_pattern) && content.contains(escape_pattern) {
549            return Err(format!(
550                "Commit message contains both JSON artifacts ({json_pattern}) and literal escape sequences ({escape_pattern}). This indicates JSON parsing failure."
551            ));
552        }
553    }
554
555    // Pattern 3: Check for repeated literal escape sequences that suggest bulk unescaping failure
556    // This catches cases where \\n\\n\\n appears (multiple escaped newlines that weren't processed)
557    if content.contains("\\n\\n\\n") || content.contains("\\n\\n\\n\\n") {
558        return Err(
559            "Commit message contains repeated literal escape sequences (\\n\\n\\n). \
560             This indicates JSON string values were not properly unescaped."
561                .to_string(),
562        );
563    }
564
565    Ok(())
566}
567
568/// Validate that content does not start with error markers.
569fn validate_no_error_markers(content: &str) -> Result<(), String> {
570    let error_markers = [
571        "error:",
572        "failed to",
573        "unable to",
574        "i cannot",
575        "i'm unable",
576        "as an ai",
577        "i don't have access",
578        "cannot generate",
579    ];
580    let content_lower = content.to_lowercase();
581    for marker in error_markers {
582        if content_lower.starts_with(marker) {
583            return Err(format!("Commit message starts with error marker: {marker}"));
584        }
585    }
586    Ok(())
587}
588
589/// Validate that content does not contain agent error messages.
590fn validate_no_agent_errors(content: &str) -> Result<(), String> {
591    // Check for agent error messages that leaked into output
592    // This handles cases where agents output errors in their result field
593    // that bypassed the normal stderr error detection
594    let agent_error_patterns = [
595        "prompt is too long",
596        "token limit exceeded",
597        "context length exceeded",
598        "maximum context",
599        "input too large",
600        "invalid request",
601        "request failed",
602    ];
603    let content_lower = content.to_lowercase();
604    for pattern in &agent_error_patterns {
605        if content_lower.contains(pattern) {
606            return Err(format!(
607                "Output contains agent error message ({pattern}). Cannot use as commit message."
608            ));
609        }
610    }
611    Ok(())
612}
613
614/// Validate that content does not contain AI thought process leakage.
615fn validate_no_thought_process_leakage(content: &str) -> Result<(), String> {
616    let content_lower = content.to_lowercase();
617
618    // Check for AI thought process leakage at the start of the message
619    // This validation catches cases where the filtering in remove_thought_process_patterns
620    // failed to remove the AI analysis before the actual commit message
621    let thought_process_prefixes = [
622        "looking at this diff",
623        "i can see",
624        "the main changes are",
625        "several distinct categories",
626        "key categories",
627        "based on the diff",
628        "analyzing the changes",
629        "this diff shows",
630        "looking at the changes",
631        "i've analyzed",
632        "after reviewing",
633        // Additional patterns to catch more variations
634        "based on the git diff",
635        "here are the changes",
636        "here's what changed",
637        "here is what changed",
638        "the following changes",
639        "changes include",
640        "after reviewing the diff",
641        "after reviewing the changes",
642        "after analyzing",
643        "i've analyzed the changes",
644        "i've analyzed the diff",
645        "key changes",
646        "several changes",
647        "distinct changes",
648        "key changes include",
649        "several changes include",
650        "this diff shows the following",
651    ];
652    for prefix in &thought_process_prefixes {
653        if content_lower.starts_with(prefix) {
654            return Err(format!(
655                "Commit message starts with AI thought process ({prefix}). This indicates a bug in the thought process filtering."
656            ));
657        }
658    }
659
660    // Check for numbered analysis at the start (1., 2., 3., etc.)
661    if content.trim_start().starts_with("1. ")
662        || content.trim_start().starts_with("1)\n")
663        || content_lower.starts_with("- first")
664        || content_lower.starts_with("* first")
665    {
666        return Err(
667            "Commit message starts with numbered analysis. This indicates AI thought process leakage.".to_string()
668        );
669    }
670
671    // Check for formatted thinking output patterns (e.g., "[Claude] Thinking:")
672    // This catches formatted thinking output from CLI display that leaked into the log
673    let formatted_thinking_patterns = [
674        "[claude] thinking:",
675        "[claude] Thinking:",
676        "[agent] thinking:",
677        "[agent] Thinking:",
678        "[assistant] thinking:",
679        "[assistant] Thinking:",
680        "] thinking:",
681        "] Thinking:",
682    ];
683    for pattern in &formatted_thinking_patterns {
684        if content_lower.starts_with(pattern) || content.contains(pattern) {
685            return Err(format!(
686                "Commit message contains formatted thinking pattern ({pattern}). This indicates AI thinking output leaked into the commit message."
687            ));
688        }
689    }
690
691    Ok(())
692}
693
694/// Validate that content does not contain placeholder text.
695///
696/// This function uses regex-based validation with word boundary checks to prevent
697/// false positives from legitimate technical content that may contain words like
698/// "placeholder" in valid contexts (e.g., "placeholder attribute", "template placeholders").
699///
700/// # Why this is complex
701///
702/// We need to distinguish between:
703/// 1. Actual placeholder text that should be rejected: "[commit message]", "placeholder for"
704/// 2. Valid technical uses that should be allowed: "placeholder attribute", "template placeholders"
705///
706/// Simple substring matching would reject legitimate commits. The regex patterns with
707/// word boundaries (\b) and context-specific patterns allow us to be precise.
708///
709/// # Template variables
710///
711/// Template variables like `{{PROMPT}}`, `{{PLAN}}`, `{{DIFF}}` are explicitly excluded
712/// from being flagged as placeholders since they are valid template syntax that gets
713/// substituted before use.
714///
715/// # Valid technical contexts
716///
717/// The `valid_contexts` list contains phrases where "placeholder" appears in legitimate
718/// technical documentation. This list is intentionally conservative - when in doubt,
719/// we prefer to reject and let the user clarify rather than accept an ambiguous commit.
720fn validate_no_placeholders(content: &str) -> Result<(), String> {
721    // Valid template variables that should NOT be flagged as placeholders
722    let valid_template_vars = ["{{prompt}}", "{{plan}}", "{{diff}}"];
723    let content_lower = content.to_lowercase();
724
725    // Check if content consists ONLY of valid template variables (no other text)
726    // This is valid - template variables alone are not placeholders
727    let is_only_valid_template_var = valid_template_vars.iter().any(|v| content_lower == *v);
728
729    // Placeholder patterns with word boundary checks
730    // These patterns must appear as complete phrases, not as substrings
731    let placeholder_patterns = [
732        r"(?i)\[commit message\]",           // [commit message]
733        r"(?i)<commit message>",             // <commit message>
734        r"(?i)\byour commit message here\b", // your commit message here (as standalone phrase)
735        r"(?i)\[commit\s*\]",                // [commit] or [commit  ]
736        r"(?i)<commit\s*>",                  // <commit> or <commit  >
737        r"(?i)\[insert\b",                   // [insert (but not [inserted])
738        r"(?i)<insert\b",                    // <insert (but not <inserted>)
739    ];
740
741    // Build a regex that matches any of the placeholder patterns
742    let combined_pattern = placeholder_patterns.join("|");
743    if let Ok(re) = regex::Regex::new(&combined_pattern) {
744        if re.is_match(content) {
745            // Only error if this is NOT just a valid template variable
746            // (e.g., "{{prompt}} [commit message]" should still fail)
747            if !is_only_valid_template_var {
748                return Err(
749                    "Commit message contains placeholder text (e.g., '[commit message]', '<commit message>', or similar)".to_string()
750                );
751            }
752        }
753    }
754
755    // For "placeholder" specifically, only flag it if it appears as a standalone
756    // word that isn't part of a valid technical term like "placeholder attribute"
757    // We check for phrases like "is a placeholder", "placeholder for", etc.
758    let placeholder_context_patterns = [
759        r"(?i)\bplaceholder\s+for\b",              // "placeholder for"
760        r"(?i)\bplaceholder\s+text\b",             // "placeholder text"
761        r"(?i)\bplaceholder\s+here\b",             // "placeholder here"
762        r"(?i)\bplaceholder\s*\)",                 // "placeholder)"
763        r"(?i)is\s+(?:a\s+)?placeholder\b",        // "is a placeholder" or "is placeholder"
764        r"(?i)this\s+is\s+(?:a\s+)?placeholder\b", // "this is a placeholder" or "this is placeholder"
765    ];
766
767    let combined_placeholder_context = placeholder_context_patterns.join("|");
768    if let Ok(re) = regex::Regex::new(&combined_placeholder_context) {
769        if re.is_match(content) {
770            return Err(
771                "Commit message contains placeholder text (e.g., 'placeholder for', 'placeholder text', or similar)".to_string()
772            );
773        }
774    }
775
776    // Also check for bare "placeholder" word with word boundaries, but exclude
777    // common valid technical uses like "placeholder attribute", "template placeholders",
778    // or valid operations like "remove placeholder from UI", "delete placeholder", etc.
779    let bare_placeholder = regex::Regex::new(r"(?i)\bplaceholder\b").unwrap();
780    if bare_placeholder.is_match(content) {
781        let content_lower_for_ctx = content.to_lowercase();
782
783        // Check for common valid action verbs that clearly indicate this is about
784        // manipulating existing placeholders (not creating placeholder text)
785        // These are "destructive" or "fixing" actions that clearly indicate real work
786        let valid_action_patterns = [
787            r"(?i)\bremove\s+.*?\bplaceholder\b",
788            r"(?i)\bdelete\s+.*?\bplaceholder\b",
789            r"(?i)\bfix\s+.*?\bplaceholder\b",
790            r"(?i)\bclear\s+.*?\bplaceholder\b",
791            // Specific UI context patterns
792            r"(?i)\bplaceholder\s+in\s+(?:the\s+)?(?:ui|form|input|field)\b",
793            r"(?i)\bfrom\s+(?:the\s+)?(?:ui|form|input|field).*\bplaceholder\b",
794            r"(?i)\bplaceholder\s+(?:text|value|content)\b",
795        ];
796
797        // First check action patterns - if any match, this is valid
798        let combined_actions = valid_action_patterns.join("|");
799        if let Ok(action_re) = regex::Regex::new(&combined_actions) {
800            if action_re.is_match(content) {
801                return Ok(());
802            }
803        }
804
805        // Check if it's in a valid technical context (legacy check)
806        let valid_contexts = [
807            // HTML/UI attribute contexts
808            "placeholder attribute",
809            "placeholder element",
810            "placeholder div",
811            "placeholder span",
812            "placeholder class",
813            // Template/variable contexts
814            "template placeholders",
815            "template placeholder",
816            "placeholder variable",
817            "placeholder variables",
818            "placeholder value",
819            "placeholder values",
820            // Substitution contexts
821            "substitute placeholder",
822            "substituting placeholder",
823            "replace placeholder",
824            "replacing placeholder",
825        ];
826
827        let mut in_valid_context = false;
828        for valid_ctx in &valid_contexts {
829            if content_lower_for_ctx.contains(valid_ctx) {
830                in_valid_context = true;
831                break;
832            }
833        }
834
835        if !in_valid_context {
836            return Err(
837                "Commit message contains 'placeholder'. If this refers to technical concepts (templates, variables, etc.), use more specific language like 'template placeholders' or 'placeholder variable'".to_string()
838            );
839        }
840    }
841
842    Ok(())
843}
844
845/// Validate that content does not match bad commit message patterns.
846fn validate_no_bad_patterns(content: &str) -> Result<(), String> {
847    let content_lower = content.to_lowercase();
848
849    // Check for bad commit message patterns (vague, meaningless messages)
850    // Use regex to catch ALL variants, not just hardcoded numbers
851
852    // Pattern 1: "chore: N file(s) changed" for ANY number N
853    // Handles: "file(s) changed", "files changed", "file changed" variations
854    if file_count_pattern_regex().is_match(&content_lower) {
855        return Err(format!(
856            "Commit message matches bad pattern (file count pattern): '{content}'. Use semantic description instead."
857        ));
858    }
859
860    // Pattern 2: Generic vague patterns
861    let vague_patterns = [
862        ("chore: apply changes", "vague 'apply changes' pattern"),
863        ("chore: update code", "vague 'update code' pattern"),
864    ];
865    for (pattern, description) in vague_patterns {
866        if content_lower == pattern {
867            return Err(format!(
868                "Commit message matches bad pattern ({description}): {pattern}"
869            ));
870        }
871    }
872
873    // Check for filename list patterns like "chore: update src/file.rs" or "chore: src/file.rs, src/other.rs"
874    // These are bad because they just list filenames without semantic meaning
875    let first_line = content.lines().next().unwrap_or(content);
876    let first_line_lower = first_line.to_lowercase();
877
878    // Check both "chore: update <path>" and "chore: <path>" patterns
879    if first_line_lower.starts_with("chore: update ") || first_line_lower.starts_with("chore:") {
880        let subject = first_line_lower
881            .replacen("chore: update ", "", 1)
882            .replacen("chore:", "", 1)
883            .trim()
884            .to_string();
885
886        // Check if subject looks like a file path or list of file paths
887        // File paths contain '/' or end with common extensions
888        // We need to check for multiple patterns:
889        // 1. Single file path: "src/file.rs"
890        // 2. Multiple files with commas: "src/a.rs, src/b.rs"
891        // 3. Multiple files with "and": "src/a.rs and src/b.rs"
892
893        // Common code file extensions
894        let code_extensions = [
895            ".rs", ".js", ".ts", ".py", ".go", ".java", ".c", ".cpp", ".h", ".cs", ".php", ".rb",
896            ".swift", ".kt",
897        ];
898
899        // Check if subject looks like a file path or list of file paths
900        let looks_like_file_list = subject.contains('/')
901            || subject.contains('\\') || // Windows paths
902            code_extensions.iter().any(|ext| subject.ends_with(ext));
903
904        // Additional check: if there are commas and file extensions, it's definitely a file list
905        let has_comma_separated_files =
906            subject.contains(", ") && code_extensions.iter().any(|ext| subject.contains(ext));
907
908        // Check for "and" separated files
909        let has_and_separated_files =
910            subject.contains(" and ") && code_extensions.iter().any(|ext| subject.contains(ext));
911
912        if looks_like_file_list || has_comma_separated_files || has_and_separated_files {
913            return Err(format!(
914                "Commit message appears to be a file list: '{}'. Use semantic description instead.",
915                first_line.trim()
916            ));
917        }
918    }
919
920    Ok(())
921}
922
923/// Validate extracted content for use as a commit message.
924///
925/// # Returns
926///
927/// `Ok(())` if valid, `Err(reason)` if invalid
928pub fn validate_commit_message(content: &str) -> Result<(), String> {
929    let content = content.trim();
930
931    // Run all validation checks in order
932    validate_basic_length(content)?;
933    validate_no_json_artifacts(content)?;
934    validate_no_literal_escape_sequences(content)?;
935    validate_no_error_markers(content)?;
936    validate_no_agent_errors(content)?;
937    validate_no_thought_process_leakage(content)?;
938    validate_no_placeholders(content)?;
939    validate_no_bad_patterns(content)?;
940
941    Ok(())
942}
943
944/// Result of a single validation check.
945#[derive(Debug, Clone)]
946pub struct ValidationCheckResult {
947    /// Name of the validation check.
948    pub name: &'static str,
949    /// Whether the check passed.
950    pub passed: bool,
951    /// Error message if the check failed.
952    pub error: Option<String>,
953}
954
955impl ValidationCheckResult {
956    /// Create a passing check result.
957    const fn pass(name: &'static str) -> Self {
958        Self {
959            name,
960            passed: true,
961            error: None,
962        }
963    }
964
965    /// Create a failing check result.
966    const fn fail(name: &'static str, error: String) -> Self {
967        Self {
968            name,
969            passed: false,
970            error: Some(error),
971        }
972    }
973}
974
975/// Complete validation report showing all checks run.
976///
977/// Unlike `validate_commit_message` which short-circuits on first failure,
978/// this runs ALL checks and reports results for each one.
979#[derive(Debug, Clone)]
980pub struct ValidationReport {
981    /// Results of all validation checks.
982    pub checks: Vec<ValidationCheckResult>,
983}
984
985impl ValidationReport {
986    /// Check if all validations passed.
987    pub fn all_passed(&self) -> bool {
988        self.checks.iter().all(|c| c.passed)
989    }
990
991    /// Format all failed checks as a descriptive string.
992    pub fn format_failures(&self) -> Option<String> {
993        let failures: Vec<_> = self
994            .checks
995            .iter()
996            .filter(|c| !c.passed)
997            .map(|c| format!("{}: {}", c.name, c.error.as_deref().unwrap_or("failed")))
998            .collect();
999
1000        if failures.is_empty() {
1001            None
1002        } else {
1003            Some(failures.join("; "))
1004        }
1005    }
1006}
1007
1008/// Validate commit message and return detailed report of all checks.
1009///
1010/// Unlike `validate_commit_message`, this does NOT short-circuit on first failure.
1011/// It runs all checks and returns a complete report.
1012pub fn validate_commit_message_with_report(content: &str) -> ValidationReport {
1013    let content = content.trim();
1014
1015    // Run all validation checks without short-circuiting
1016    let checks = vec![
1017        match validate_basic_length(content) {
1018            Ok(()) => ValidationCheckResult::pass("basic_length"),
1019            Err(e) => ValidationCheckResult::fail("basic_length", e),
1020        },
1021        match validate_no_json_artifacts(content) {
1022            Ok(()) => ValidationCheckResult::pass("no_json_artifacts"),
1023            Err(e) => ValidationCheckResult::fail("no_json_artifacts", e),
1024        },
1025        match validate_no_literal_escape_sequences(content) {
1026            Ok(()) => ValidationCheckResult::pass("no_literal_escape_sequences"),
1027            Err(e) => ValidationCheckResult::fail("no_literal_escape_sequences", e),
1028        },
1029        match validate_no_error_markers(content) {
1030            Ok(()) => ValidationCheckResult::pass("no_error_markers"),
1031            Err(e) => ValidationCheckResult::fail("no_error_markers", e),
1032        },
1033        match validate_no_agent_errors(content) {
1034            Ok(()) => ValidationCheckResult::pass("no_agent_errors"),
1035            Err(e) => ValidationCheckResult::fail("no_agent_errors", e),
1036        },
1037        match validate_no_thought_process_leakage(content) {
1038            Ok(()) => ValidationCheckResult::pass("no_thought_process_leakage"),
1039            Err(e) => ValidationCheckResult::fail("no_thought_process_leakage", e),
1040        },
1041        match validate_no_placeholders(content) {
1042            Ok(()) => ValidationCheckResult::pass("no_placeholders"),
1043            Err(e) => ValidationCheckResult::fail("no_placeholders", e),
1044        },
1045        match validate_no_bad_patterns(content) {
1046            Ok(()) => ValidationCheckResult::pass("no_bad_patterns"),
1047            Err(e) => ValidationCheckResult::fail("no_bad_patterns", e),
1048        },
1049    ];
1050
1051    ValidationReport { checks }
1052}
1053
1054// =========================================================================
1055// Final Commit Message Rendering
1056// =========================================================================
1057
1058/// Render the final commit message with all cleanup applied.
1059///
1060/// This is the final step before returning a commit message for use in git commit.
1061/// It applies:
1062/// 1. Escape sequence cleanup (aggressive unescaping)
1063/// 2. Validation to ensure no escape sequences leaked through
1064/// 3. Final formatting checks
1065///
1066/// If rendering fails (e.g., validation still fails after cleanup), this returns
1067/// the best effort cleaned message. The caller should handle validation failures.
1068///
1069/// # Arguments
1070///
1071/// * `message` - The commit message to render
1072///
1073/// # Returns
1074///
1075/// The fully rendered commit message with all escape sequences properly handled.
1076pub fn render_final_commit_message(message: &str) -> String {
1077    let mut result = message.to_string();
1078
1079    // Step 1: Apply final escape sequence cleanup
1080    // This handles any escape sequences that leaked through the pipeline
1081    result = final_escape_sequence_cleanup(&result);
1082
1083    // Step 2: Validate the result
1084    // If validation fails due to escape sequences, try aggressive cleanup
1085    if let Err(e) = validate_commit_message(&result) {
1086        // Check if the error is about escape sequences
1087        let error_lower = e.to_lowercase();
1088        if error_lower.contains("escape sequence") || error_lower.contains("\\n") {
1089            // Apply aggressive unescaping
1090            result = unescape_json_strings_aggressive(&result);
1091        }
1092        // Note: We don't re-validate here because the caller should handle validation
1093        // We just do our best to clean up the message
1094    }
1095
1096    // Step 3: Final whitespace cleanup
1097    result = result
1098        .lines()
1099        .map(str::trim)
1100        .filter(|l| !l.is_empty())
1101        .collect::<Vec<_>>()
1102        .join("\n");
1103
1104    result
1105}
1106
1107// =========================================================================
1108// Commit Message Recovery Functions
1109// =========================================================================
1110
1111/// Attempt to salvage a valid commit message from content that failed validation.
1112///
1113/// Uses aggressive pattern matching to find a conventional commit message
1114/// embedded within mixed AI output (thinking + actual message).
1115///
1116/// This is used when `validate_commit_message()` fails, to try extracting a valid
1117/// commit message from mixed output containing thinking patterns and actual content.
1118///
1119/// # Returns
1120///
1121/// `Some(message)` if a valid commit message was salvaged, `None` otherwise.
1122pub fn try_salvage_commit_message(content: &str) -> Option<String> {
1123    // Look for conventional commit pattern anywhere in the content
1124    let commit_pos = find_conventional_commit_start(content)?;
1125
1126    // Extract from that position
1127    let from_commit = &content[commit_pos..];
1128
1129    // Find where the commit message ends (next blank line or end of content)
1130    // But include the body paragraph if it follows immediately
1131    let lines: Vec<&str> = from_commit.lines().collect();
1132
1133    if lines.is_empty() {
1134        return None;
1135    }
1136
1137    // First line is the subject
1138    let subject = lines[0].trim();
1139    if subject.is_empty() {
1140        return None;
1141    }
1142
1143    // Collect body lines (everything after subject until double newline or analysis)
1144    let mut body_lines: Vec<&str> = Vec::new();
1145    let mut found_blank = false;
1146
1147    for line in lines.iter().skip(1) {
1148        let trimmed: &str = line.trim();
1149
1150        if trimmed.is_empty() {
1151            if found_blank {
1152                // Double blank line - end of message
1153                break;
1154            }
1155            found_blank = true;
1156            body_lines.push("");
1157            continue;
1158        }
1159
1160        // Check if this line looks like analysis starting up again
1161        if looks_like_analysis_text(trimmed)
1162            || trimmed.starts_with("1. ")
1163            || trimmed.starts_with("- ")
1164            || trimmed.starts_with("* ")
1165        {
1166            break;
1167        }
1168
1169        body_lines.push(trimmed);
1170        found_blank = false;
1171    }
1172
1173    // Build the salvaged message
1174    let mut salvaged = subject.to_string();
1175    if !body_lines.is_empty() {
1176        // Remove trailing empty lines from body
1177        while body_lines.last().is_some_and(|l| l.is_empty()) {
1178            body_lines.pop();
1179        }
1180        if !body_lines.is_empty() {
1181            salvaged.push('\n');
1182            salvaged.push_str(&body_lines.join("\n"));
1183        }
1184    }
1185
1186    // Validate the salvaged message before returning
1187    match validate_commit_message(&salvaged) {
1188        Ok(()) => Some(salvaged),
1189        Err(_) => None,
1190    }
1191}
1192
1193/// Generate a deterministic fallback commit message from diff metadata.
1194///
1195/// This is the last resort when:
1196/// 1. Agent output extraction failed
1197/// 2. Salvage attempt failed
1198///
1199/// # Arguments
1200///
1201/// * `diff` - The git diff content
1202///
1203/// # Returns
1204///
1205/// A valid commit message based on changed files. The message is designed to:
1206/// - Use "chore" type with an appropriate scope
1207/// - Describe the change semantically (not just file names)
1208/// - Pass validation (avoids bad patterns like file lists)
1209pub fn generate_fallback_commit_message(diff: &str) -> String {
1210    let files = extract_files_from_diff(diff);
1211
1212    if files.is_empty() {
1213        // No files found in diff - minimal fallback
1214        return "chore: apply automated changes".to_string();
1215    }
1216
1217    // Find common directory to use as scope
1218    let common_dir = find_common_directory(&files);
1219
1220    // Derive a scope from the common directory
1221    let scope = common_dir
1222        .as_ref()
1223        .and_then(|dir| derive_scope_from_path(dir));
1224
1225    // Determine if this is a single file or multiple files
1226    let file_count = files.len();
1227
1228    // Build the commit message
1229    match (file_count, scope) {
1230        (1, Some(scope)) => {
1231            // Single file with a scope - use scope in message
1232            format!("chore({scope}): update module")
1233        }
1234        (1, None) => {
1235            // Single file without clear scope
1236            files
1237                .first()
1238                .and_then(|f| derive_scope_from_path(f))
1239                .map_or_else(
1240                    || "chore: update module".to_string(),
1241                    |component| format!("chore({component}): update module"),
1242                )
1243        }
1244        (n, Some(scope)) => {
1245            // Multiple files with common scope
1246            format!("chore({scope}): update {n} components")
1247        }
1248        (n, None) => {
1249            // Multiple files without common scope
1250            // Try to find any meaningful scope from the first file
1251            files
1252                .first()
1253                .and_then(|f| derive_scope_from_path(f))
1254                .map_or_else(
1255                    || format!("chore: update {n} components"),
1256                    |component| format!("chore({component}): update {n} components"),
1257                )
1258        }
1259    }
1260}
1261
1262/// Extract changed file paths from a git diff.
1263///
1264/// Parses diff headers like `diff --git a/path/to/file b/path/to/file` to get file paths.
1265fn extract_files_from_diff(diff: &str) -> Vec<String> {
1266    let mut files = Vec::new();
1267
1268    for line in diff.lines() {
1269        // Match "diff --git a/<path> b/<path>" pattern
1270        if let Some(rest) = line.strip_prefix("diff --git a/") {
1271            // The path is up to " b/" (space before b/)
1272            if let Some(space_b_pos) = rest.find(" b/") {
1273                let path = &rest[..space_b_pos];
1274                if !path.is_empty() {
1275                    files.push(path.to_string());
1276                }
1277            }
1278        }
1279    }
1280
1281    files
1282}
1283
1284/// Find the common directory prefix for a set of file paths.
1285///
1286/// Returns the longest common directory path shared by all files.
1287fn find_common_directory(paths: &[String]) -> Option<String> {
1288    if paths.is_empty() {
1289        return None;
1290    }
1291
1292    if paths.len() == 1 {
1293        // For a single file, return its parent directory
1294        let path = &paths[0];
1295        if let Some(last_slash) = path.rfind('/') {
1296            return Some(path[..last_slash].to_string());
1297        }
1298        return None;
1299    }
1300
1301    // Split all paths into components
1302    let split_paths: Vec<Vec<&str>> = paths.iter().map(|p| p.split('/').collect()).collect();
1303
1304    // Find common prefix
1305    let mut common_components: Vec<&str> = Vec::new();
1306
1307    // Use the first path as reference
1308    let first = &split_paths[0];
1309    for (i, component) in first.iter().enumerate() {
1310        // Check if all other paths have this component at this position
1311        let all_match = split_paths.iter().skip(1).all(|path| {
1312            // Don't compare the filename itself (last component)
1313            i < path.len().saturating_sub(1) && path.get(i) == Some(component)
1314        });
1315
1316        if all_match && i < first.len().saturating_sub(1) {
1317            common_components.push(component);
1318        } else {
1319            break;
1320        }
1321    }
1322
1323    if common_components.is_empty() {
1324        None
1325    } else {
1326        Some(common_components.join("/"))
1327    }
1328}
1329
1330/// Derive a semantic scope name from a file path.
1331///
1332/// Extracts a meaningful component name from the path, preferring:
1333/// - Last directory name for nested paths (e.g., "files" from "src/files/extraction.rs")
1334/// - First directory for shallow paths (e.g., "src" from "src/lib.rs")
1335fn derive_scope_from_path(path: &str) -> Option<String> {
1336    let components: Vec<&str> = path.split('/').collect();
1337
1338    if components.is_empty() {
1339        return None;
1340    }
1341
1342    // Filter out common non-semantic directories
1343    let skip_dirs = ["src", "lib", "bin", "tests", "test", "benches", "examples"];
1344
1345    // Try to find a meaningful component (prefer second-to-last directory)
1346    for component in components.iter().rev().skip(1) {
1347        let comp_lower = component.to_lowercase();
1348        if !skip_dirs.contains(&comp_lower.as_str()) && !component.is_empty() {
1349            return Some(component.to_string());
1350        }
1351    }
1352
1353    // If all directories are skipped, try the first non-skip directory
1354    for component in &components {
1355        if !skip_dirs.contains(&component.to_lowercase().as_str())
1356            && !component.is_empty()
1357            && !component.contains('.')
1358        {
1359            return Some(component.to_string());
1360        }
1361    }
1362
1363    None
1364}
1365
1366#[cfg(test)]
1367mod tests {
1368    use super::*;
1369
1370    #[test]
1371    fn test_validate_empty_message() {
1372        let result = validate_commit_message("");
1373        assert!(result.is_err());
1374        assert!(result.unwrap_err().contains("empty"));
1375    }
1376
1377    #[test]
1378    fn test_validate_too_short() {
1379        let result = validate_commit_message("fix");
1380        assert!(result.is_err());
1381        assert!(result.unwrap_err().contains("too short"));
1382    }
1383
1384    #[test]
1385    fn test_validate_valid_message() {
1386        let result = validate_commit_message("feat: add new feature");
1387        assert!(result.is_ok());
1388    }
1389
1390    #[test]
1391    fn test_validate_json_artifacts() {
1392        let result = validate_commit_message("feat: add feature {\"type\":\"result\"}");
1393        assert!(result.is_err());
1394        assert!(result.unwrap_err().contains("JSON artifacts"));
1395    }
1396
1397    #[test]
1398    fn test_validate_error_markers() {
1399        let result = validate_commit_message("error: unable to generate");
1400        assert!(result.is_err());
1401        assert!(result.unwrap_err().contains("error marker"));
1402    }
1403
1404    #[test]
1405    fn test_validate_thought_process_leakage() {
1406        let result = validate_commit_message("Looking at this diff, I can see changes");
1407        assert!(result.is_err());
1408        assert!(result.unwrap_err().contains("AI thought process"));
1409    }
1410
1411    #[test]
1412    fn test_validate_numbered_analysis() {
1413        let result = validate_commit_message("1. First change\n2. Second change");
1414        assert!(result.is_err());
1415        assert!(result.unwrap_err().contains("numbered analysis"));
1416    }
1417
1418    #[test]
1419    fn test_validate_bad_file_count_pattern() {
1420        let result = validate_commit_message("chore: 5 files changed");
1421        assert!(result.is_err());
1422        assert!(result.unwrap_err().contains("file count pattern"));
1423    }
1424
1425    #[test]
1426    fn test_validate_file_list_pattern() {
1427        let result = validate_commit_message("chore: update src/file.rs");
1428        assert!(result.is_err());
1429        assert!(result.unwrap_err().contains("file list"));
1430    }
1431
1432    #[test]
1433    fn test_try_salvage_commit_message() {
1434        let content = "Looking at this diff...\n\nfeat: add feature";
1435        let salvaged = try_salvage_commit_message(content);
1436        assert!(salvaged.is_some());
1437        assert_eq!(salvaged.unwrap(), "feat: add feature");
1438    }
1439
1440    #[test]
1441    fn test_try_salvage_with_body() {
1442        let content = "Analysis text\n\nfix(parser): resolve bug\n\nAdd proper error handling.";
1443        let salvaged = try_salvage_commit_message(content);
1444        assert!(salvaged.is_some());
1445        let msg = salvaged.unwrap();
1446        assert!(msg.starts_with("fix(parser):"));
1447        assert!(msg.contains("Add proper error handling"));
1448    }
1449
1450    #[test]
1451    fn test_generate_fallback_empty_diff() {
1452        let fallback = generate_fallback_commit_message("");
1453        assert_eq!(fallback, "chore: apply automated changes");
1454    }
1455
1456    #[test]
1457    fn test_generate_fallback_single_file() {
1458        let diff = r"diff --git a/src/files/extraction.rs b/src/files/extraction.rs";
1459        let fallback = generate_fallback_commit_message(diff);
1460        assert!(validate_commit_message(&fallback).is_ok());
1461        assert!(fallback.contains("files") || fallback.contains("update"));
1462    }
1463
1464    #[test]
1465    fn test_generate_fallback_multiple_files_same_dir() {
1466        let diff = r"diff --git a/src/files/a.rs b/src/files/a.rs
1467diff --git a/src/files/b.rs b/src/files/b.rs";
1468        let fallback = generate_fallback_commit_message(diff);
1469        assert!(validate_commit_message(&fallback).is_ok());
1470        assert!(fallback.contains("files") || fallback.contains("components"));
1471    }
1472
1473    #[test]
1474    fn test_generate_fallback_multiple_dirs() {
1475        let diff = r"diff --git a/src/a.rs b/src/a.rs
1476diff --git a/lib/b.rs b/lib/b.rs
1477diff --git a/tests/c.rs b/tests/c.rs";
1478        let fallback = generate_fallback_commit_message(diff);
1479        assert!(validate_commit_message(&fallback).is_ok());
1480        assert!(fallback.contains("3 components") || fallback.contains("chore"));
1481    }
1482
1483    #[test]
1484    fn test_regression_thinking_leakage_recovery() {
1485        // The exact scenario from the bug report
1486        let log_content = r"[Claude] Thinking: Looking at this diff, I need to analyze...
1487
1488feat(pipeline): add recovery mechanism
1489
1490When commit validation fails, attempt to salvage valid message.";
1491
1492        // Verify salvage recovers it
1493        let salvaged = try_salvage_commit_message(log_content);
1494        assert!(salvaged.is_some());
1495        let msg = salvaged.unwrap();
1496        assert!(validate_commit_message(&msg).is_ok());
1497        assert!(msg.starts_with("feat(pipeline):"));
1498    }
1499
1500    #[test]
1501    fn test_extract_files_from_diff() {
1502        let diff = r"diff --git a/src/files/extraction.rs b/src/files/extraction.rs
1503--- a/src/files/extraction.rs
1504+++ b/src/files/extraction.rs
1505diff --git a/src/phases/commit.rs b/src/phases/commit.rs
1506--- a/src/phases/commit.rs
1507+++ b/src/phases/commit.rs";
1508
1509        let files = extract_files_from_diff(diff);
1510        assert_eq!(files.len(), 2);
1511        assert_eq!(files[0], "src/files/extraction.rs");
1512        assert_eq!(files[1], "src/phases/commit.rs");
1513    }
1514
1515    #[test]
1516    fn test_find_common_directory_same_dir() {
1517        let paths = vec![
1518            "src/files/a.rs".to_string(),
1519            "src/files/b.rs".to_string(),
1520            "src/files/c.rs".to_string(),
1521        ];
1522        let common = find_common_directory(&paths);
1523        assert_eq!(common, Some("src/files".to_string()));
1524    }
1525
1526    #[test]
1527    fn test_find_common_directory_partial_overlap() {
1528        let paths = vec![
1529            "src/files/extraction.rs".to_string(),
1530            "src/phases/commit.rs".to_string(),
1531        ];
1532        let common = find_common_directory(&paths);
1533        assert_eq!(common, Some("src".to_string()));
1534    }
1535
1536    #[test]
1537    fn test_find_common_directory_no_overlap() {
1538        let paths = vec!["src/a.rs".to_string(), "lib/b.rs".to_string()];
1539        let common = find_common_directory(&paths);
1540        assert!(common.is_none());
1541    }
1542
1543    #[test]
1544    fn test_derive_scope_from_path() {
1545        // Should extract "files" from nested path
1546        assert_eq!(
1547            derive_scope_from_path("src/files/extraction.rs"),
1548            Some("files".to_string())
1549        );
1550
1551        // Should extract "phases" from path
1552        assert_eq!(
1553            derive_scope_from_path("src/phases/commit.rs"),
1554            Some("phases".to_string())
1555        );
1556
1557        // Should skip "src" as non-semantic
1558        assert_ne!(
1559            derive_scope_from_path("src/files/foo.rs"),
1560            Some("src".to_string())
1561        );
1562    }
1563
1564    #[test]
1565    fn test_derive_scope_from_shallow_path() {
1566        // For shallow paths like "foo.rs" or "src/lib.rs", should return None or meaningful component
1567        let scope = derive_scope_from_path("lib.rs");
1568        // lib.rs has no meaningful directory scope
1569        assert!(scope.is_none());
1570    }
1571
1572    // =========================================================================
1573    // Tests for agent error detection in output
1574    // =========================================================================
1575
1576    #[test]
1577    fn test_detect_agent_errors_in_output_prompt_too_long() {
1578        // "Prompt is too long" should be detected as TokenExhausted
1579        let content = r#"{"type":"result","result":"Prompt is too long"}"#;
1580        assert_eq!(
1581            detect_agent_errors_in_output(content),
1582            Some(AgentErrorKind::TokenExhausted)
1583        );
1584    }
1585
1586    #[test]
1587    fn test_detect_agent_errors_in_output_token_limit() {
1588        // "token limit exceeded" should be detected as TokenExhausted
1589        let content = r#"{"type":"result","result":"token limit exceeded"}"#;
1590        assert_eq!(
1591            detect_agent_errors_in_output(content),
1592            Some(AgentErrorKind::TokenExhausted)
1593        );
1594    }
1595
1596    #[test]
1597    fn test_detect_agent_errors_in_output_context_length() {
1598        // "context length exceeded" should be detected as TokenExhausted
1599        let content = "error: context length exceeded for this model";
1600        assert_eq!(
1601            detect_agent_errors_in_output(content),
1602            Some(AgentErrorKind::TokenExhausted)
1603        );
1604    }
1605
1606    #[test]
1607    fn test_detect_agent_errors_in_output_maximum_context() {
1608        // "maximum context" should be detected as TokenExhausted
1609        let content = "maximum context size reached";
1610        assert_eq!(
1611            detect_agent_errors_in_output(content),
1612            Some(AgentErrorKind::TokenExhausted)
1613        );
1614    }
1615
1616    #[test]
1617    fn test_detect_agent_errors_in_output_input_too_large() {
1618        // "input too large" should be detected as TokenExhausted
1619        let content = "input too large for this model";
1620        assert_eq!(
1621            detect_agent_errors_in_output(content),
1622            Some(AgentErrorKind::TokenExhausted)
1623        );
1624    }
1625
1626    #[test]
1627    fn test_detect_agent_errors_in_output_invalid_request() {
1628        // "invalid request" should be detected as InvalidResponse
1629        let content = "invalid request to the API";
1630        assert_eq!(
1631            detect_agent_errors_in_output(content),
1632            Some(AgentErrorKind::InvalidResponse)
1633        );
1634    }
1635
1636    #[test]
1637    fn test_detect_agent_errors_in_output_request_failed() {
1638        // "request failed" should be detected as InvalidResponse
1639        let content = "request failed due to server error";
1640        assert_eq!(
1641            detect_agent_errors_in_output(content),
1642            Some(AgentErrorKind::InvalidResponse)
1643        );
1644    }
1645
1646    #[test]
1647    fn test_detect_agent_errors_in_output_valid_commit_message() {
1648        // Normal commit message should return None
1649        let content = r#"{"type":"result","result":"feat: add feature"}"#;
1650        assert_eq!(detect_agent_errors_in_output(content), None);
1651    }
1652
1653    #[test]
1654    fn test_detect_agent_errors_in_output_case_insensitive() {
1655        // Detection should be case-insensitive
1656        let content = "PROMPT IS TOO LONG";
1657        assert_eq!(
1658            detect_agent_errors_in_output(content),
1659            Some(AgentErrorKind::TokenExhausted)
1660        );
1661    }
1662
1663    // =========================================================================
1664    // Tests for enhanced error detection (Step 4 improvements)
1665    // =========================================================================
1666
1667    #[test]
1668    fn test_detect_agent_errors_context_window() {
1669        // "context window" should be detected as TokenExhausted
1670        let content = "error: context window exceeded";
1671        assert_eq!(
1672            detect_agent_errors_in_output(content),
1673            Some(AgentErrorKind::TokenExhausted)
1674        );
1675    }
1676
1677    #[test]
1678    fn test_detect_agent_errors_max_tokens() {
1679        // "max tokens" should be detected as TokenExhausted
1680        let content = "max tokens exceeded for this request";
1681        assert_eq!(
1682            detect_agent_errors_in_output(content),
1683            Some(AgentErrorKind::TokenExhausted)
1684        );
1685    }
1686
1687    #[test]
1688    fn test_detect_agent_errors_token_limit() {
1689        // "token limit" should be detected as TokenExhausted
1690        let content = "token limit reached";
1691        assert_eq!(
1692            detect_agent_errors_in_output(content),
1693            Some(AgentErrorKind::TokenExhausted)
1694        );
1695    }
1696
1697    #[test]
1698    fn test_detect_agent_errors_too_many_tokens() {
1699        // "too many tokens" should be detected as TokenExhausted
1700        let content = "error: too many tokens in input";
1701        assert_eq!(
1702            detect_agent_errors_in_output(content),
1703            Some(AgentErrorKind::TokenExhausted)
1704        );
1705    }
1706
1707    #[test]
1708    fn test_detect_agent_errors_exceeds_context() {
1709        // "exceeds context" should be detected as TokenExhausted
1710        let content = "input exceeds context length";
1711        assert_eq!(
1712            detect_agent_errors_in_output(content),
1713            Some(AgentErrorKind::TokenExhausted)
1714        );
1715    }
1716
1717    #[test]
1718    fn test_detect_agent_errors_model_context_length() {
1719        // "model's context length" should be detected as TokenExhausted
1720        let content = "input exceeds the model's context length";
1721        assert_eq!(
1722            detect_agent_errors_in_output(content),
1723            Some(AgentErrorKind::TokenExhausted)
1724        );
1725    }
1726
1727    #[test]
1728    fn test_detect_agent_errors_input_exceeds() {
1729        // "input exceeds" should be detected as TokenExhausted
1730        let content = "input exceeds maximum length";
1731        assert_eq!(
1732            detect_agent_errors_in_output(content),
1733            Some(AgentErrorKind::TokenExhausted)
1734        );
1735    }
1736
1737    #[test]
1738    fn test_detect_agent_errors_api_error() {
1739        // "api error" should be detected as InvalidResponse
1740        let content = "api error occurred";
1741        assert_eq!(
1742            detect_agent_errors_in_output(content),
1743            Some(AgentErrorKind::InvalidResponse)
1744        );
1745    }
1746
1747    #[test]
1748    fn test_detect_agent_errors_rate_limit() {
1749        // "rate limit" should be detected as InvalidResponse
1750        let content = "rate limit exceeded";
1751        assert_eq!(
1752            detect_agent_errors_in_output(content),
1753            Some(AgentErrorKind::InvalidResponse)
1754        );
1755    }
1756
1757    #[test]
1758    fn test_detect_agent_errors_service_unavailable() {
1759        // "service unavailable" should be detected as InvalidResponse
1760        let content = "service unavailable, try again later";
1761        assert_eq!(
1762            detect_agent_errors_in_output(content),
1763            Some(AgentErrorKind::InvalidResponse)
1764        );
1765    }
1766
1767    #[test]
1768    fn test_validate_rejects_prompt_too_long() {
1769        // Validation should reject "Prompt is too long" messages
1770        let result = validate_commit_message("Prompt is too long");
1771        assert!(result.is_err());
1772        assert!(result.unwrap_err().contains("agent error"));
1773    }
1774
1775    #[test]
1776    fn test_validate_rejects_token_limit_exceeded() {
1777        // Validation should reject "token limit exceeded" messages
1778        let result = validate_commit_message("token limit exceeded");
1779        assert!(result.is_err());
1780        assert!(result.unwrap_err().contains("agent error"));
1781    }
1782
1783    #[test]
1784    fn test_validate_rejects_context_length() {
1785        // Validation should reject "context length exceeded" messages
1786        // Note: "error: context length exceeded" starts with "error:" which is caught by error_markers first
1787        // So we use a message that doesn't start with "error:" to test agent_error_patterns
1788        let result = validate_commit_message("The context length exceeded for this model");
1789        assert!(result.is_err());
1790        assert!(result.unwrap_err().contains("agent error"));
1791    }
1792
1793    #[test]
1794    fn test_validate_accepts_valid_message_with_error_words() {
1795        // Valid commit message containing words like "error" in a different context should pass
1796        // For example, "fix: resolve parsing error" is valid
1797        let result = validate_commit_message("fix(parser): resolve parsing error");
1798        assert!(result.is_ok());
1799    }
1800
1801    #[test]
1802    fn test_validate_rejects_json_artifacts_with_escape_sequences() {
1803        // Validation should reject JSON artifacts (happens before combined check)
1804        let result = validate_commit_message(r#"feat: add feature{"type":"result"}\\nBody text"#);
1805        assert!(result.is_err());
1806        // The JSON artifacts check runs first, so it reports JSON artifacts
1807        assert!(result.unwrap_err().contains("JSON artifacts"));
1808    }
1809
1810    #[test]
1811    fn test_validate_rejects_json_artifacts_without_escape_sequences() {
1812        // Even without escape sequences, JSON artifacts should be rejected
1813        let result = validate_commit_message(r#"feat: add feature{"type":"result"}Body text"#);
1814        assert!(result.is_err());
1815        assert!(result.unwrap_err().contains("JSON artifacts"));
1816    }
1817
1818    #[test]
1819    fn test_validate_accepts_literal_escape_without_json_artifacts() {
1820        // Validation should accept literal \n when no JSON artifacts are present
1821        // This is legitimate content (e.g., "fix: handle \n in filenames")
1822        let result = validate_commit_message("feat: add feature\\nBody text");
1823        assert!(result.is_ok());
1824    }
1825
1826    #[test]
1827    fn test_validate_accepts_literal_tab_without_json_artifacts() {
1828        // Validation should accept literal \t when no JSON artifacts are present
1829        let result = validate_commit_message("feat: add feature\\t- bullet");
1830        assert!(result.is_ok());
1831    }
1832
1833    #[test]
1834    fn test_validate_accepts_actual_newlines() {
1835        // Validation should accept actual newlines (not literal escape sequences)
1836        let result = validate_commit_message("feat: add feature\n\nBody text here");
1837        assert!(result.is_ok());
1838    }
1839
1840    // =========================================================================
1841    // Tests for enhanced escape sequence validation (Step 1 improvements)
1842    // =========================================================================
1843
1844    #[test]
1845    fn test_validate_rejects_body_starts_with_literal_newline_sequences() {
1846        // Validation should reject when body starts with literal \n\n after subject
1847        // This happens when JSON like {"subject": "feat", "body": "\\n\\ntext"}
1848        // is parsed but not unescaped - the body value contains literal \n\n
1849        // The test input has an actual newline after the subject, then literal \\n\\n
1850        let result = validate_commit_message("feat: add feature\n\\n\\nBody text here");
1851        assert!(result.is_err());
1852        assert!(result.unwrap_err().contains("literal escape sequences"));
1853    }
1854
1855    #[test]
1856    fn test_validate_rejects_body_second_line_is_literal_escape() {
1857        // Validation should reject when second line is literally "\\n"
1858        let result = validate_commit_message("feat: add feature\n\\n");
1859        assert!(result.is_err());
1860        assert!(result.unwrap_err().contains("literal escape sequences"));
1861    }
1862
1863    #[test]
1864    fn test_validate_rejects_body_second_line_is_double_literal_escape() {
1865        // Validation should reject when second line is literally "\\n\\n"
1866        let result = validate_commit_message("feat: add feature\n\\n\\n");
1867        assert!(result.is_err());
1868        assert!(result.unwrap_err().contains("literal escape sequences"));
1869    }
1870
1871    #[test]
1872    fn test_validate_rejects_repeated_literal_escape_sequences() {
1873        // Validation should reject repeated literal \\n\\n\\n patterns
1874        let result = validate_commit_message("feat: add feature\\n\\n\\nBody text");
1875        assert!(result.is_err());
1876        assert!(result
1877            .unwrap_err()
1878            .contains("repeated literal escape sequences"));
1879    }
1880
1881    #[test]
1882    fn test_validate_rejects_quadruple_literal_escape_sequences() {
1883        // Validation should reject \\n\\n\\n\\n patterns
1884        let result = validate_commit_message("feat: add feature\\n\\n\\n\\nBody text");
1885        assert!(result.is_err());
1886        assert!(result
1887            .unwrap_err()
1888            .contains("repeated literal escape sequences"));
1889    }
1890
1891    #[test]
1892    fn test_validate_accepts_legitimate_single_escape_in_middle() {
1893        // Validation should accept single \\n in middle of text (legitimate content)
1894        let result = validate_commit_message("feat: handle backslash-n in parser");
1895        assert!(result.is_ok());
1896    }
1897
1898    #[test]
1899    fn test_validate_accepts_body_with_actual_newlines() {
1900        // Validation should accept actual newlines in body
1901        let result =
1902            validate_commit_message("feat: add feature\n\nThis is the body\nwith multiple lines");
1903        assert!(result.is_ok());
1904    }
1905
1906    // =========================================================================
1907    // Tests for CommitExtractionResult::AgentError variant
1908    // =========================================================================
1909
1910    #[test]
1911    fn test_commit_extraction_result_agent_error() {
1912        // Test AgentError variant methods
1913        let result = CommitExtractionResult::AgentError(AgentErrorKind::TokenExhausted);
1914
1915        assert!(result.is_agent_error());
1916        assert!(!result.is_fallback());
1917        assert_eq!(result.error_kind(), Some(AgentErrorKind::TokenExhausted));
1918        assert_eq!(result.into_message(), String::new());
1919    }
1920
1921    #[test]
1922    fn test_commit_extraction_result_extracted_not_agent_error() {
1923        // Test that Extracted variant is not an agent error
1924        let result = CommitExtractionResult::Extracted("feat: add feature".to_string());
1925
1926        assert!(!result.is_agent_error());
1927        assert!(!result.is_fallback());
1928        assert_eq!(result.error_kind(), None);
1929        assert_eq!(result.into_message(), "feat: add feature");
1930    }
1931
1932    // =========================================================================
1933    // Tests for format_structured_commit with escaped sequences
1934    // =========================================================================
1935
1936    #[test]
1937    fn test_format_structured_commit_unescapes_body_newlines() {
1938        // Test that format_structured_commit properly unescapes \n in body
1939        let msg = StructuredCommitMessage {
1940            subject: "feat: add feature".to_string(),
1941            body: Some("Line 1\\nLine 2\\nLine 3".to_string()),
1942        };
1943        let result = format_structured_commit(&msg);
1944        assert!(result.is_some());
1945        let formatted = result.unwrap();
1946        assert!(formatted.contains("Line 1\nLine 2\nLine 3"));
1947        assert!(!formatted.contains("\\n"));
1948    }
1949
1950    #[test]
1951    fn test_format_structured_commit_unescapes_subject_newlines() {
1952        // Test that format_structured_commit properly unescapes \n in subject
1953        // Note: After unescaping, "feat: add\nfeature" contains an actual newline.
1954        // The is_conventional_commit_subject check only validates the prefix (feat:),
1955        // so this passes validation. The resulting commit message would have an embedded
1956        // newline in the subject, which is unusual but technically passes the checks.
1957        let msg = StructuredCommitMessage {
1958            subject: "feat: add\\nfeature".to_string(),
1959            body: None,
1960        };
1961        let result = format_structured_commit(&msg);
1962        // The result is Some because "feat:" is a valid type prefix
1963        assert!(result.is_some());
1964        // The subject has been unescaped, so it contains an actual newline
1965        assert!(result.unwrap().contains('\n'));
1966    }
1967
1968    #[test]
1969    fn test_format_structured_commit_with_empty_body() {
1970        // Test that format_structured_commit works with empty body
1971        let msg = StructuredCommitMessage {
1972            subject: "fix: resolve bug".to_string(),
1973            body: None,
1974        };
1975        let result = format_structured_commit(&msg);
1976        assert_eq!(result, Some("fix: resolve bug".to_string()));
1977    }
1978
1979    #[test]
1980    fn test_format_structured_commit_with_body_containing_tabs() {
1981        // Test that format_structured_commit properly unescapes \t in body
1982        let msg = StructuredCommitMessage {
1983            subject: "feat: add feature".to_string(),
1984            body: Some("- item 1\\t- item 2".to_string()),
1985        };
1986        let result = format_structured_commit(&msg);
1987        assert!(result.is_some());
1988        let formatted = result.unwrap();
1989        assert!(formatted.contains("- item 1\t- item 2"));
1990        assert!(!formatted.contains("\\t"));
1991    }
1992
1993    // =========================================================================
1994    // Tests for render_final_commit_message
1995    // =========================================================================
1996
1997    #[test]
1998    fn test_render_final_commit_message_with_literal_escapes() {
1999        // Test that render_final_commit_message cleans up escape sequences
2000        // Note: whitespace cleanup removes blank lines
2001        let input = "feat: add feature\n\\n\\nBody with literal escapes";
2002        let result = render_final_commit_message(input);
2003        assert_eq!(result, "feat: add feature\nBody with literal escapes");
2004    }
2005
2006    #[test]
2007    fn test_render_final_commit_message_already_clean() {
2008        // Test that already-clean messages pass through (whitespace cleanup applied)
2009        let input = "feat: add feature\n\nBody text here";
2010        let result = render_final_commit_message(input);
2011        assert_eq!(result, "feat: add feature\nBody text here");
2012    }
2013
2014    #[test]
2015    fn test_render_final_commit_message_with_tabs() {
2016        // Test that tab escapes are properly handled
2017        let input = "feat: add feature\\n\\t- item 1\\n\\t- item 2";
2018        let result = render_final_commit_message(input);
2019        // Tabs are stripped by whitespace cleanup (trim() removes leading whitespace)
2020        assert_eq!(result, "feat: add feature\n- item 1\n- item 2");
2021    }
2022
2023    #[test]
2024    fn test_render_final_commit_message_with_carriage_returns() {
2025        // Test that carriage return escapes are properly handled
2026        let input = "feat: add feature\\r\\nBody text";
2027        let result = render_final_commit_message(input);
2028        // Carriage returns are converted, but whitespace cleanup removes extra blank lines
2029        assert_eq!(result, "feat: add feature\nBody text");
2030    }
2031
2032    #[test]
2033    fn test_render_final_commit_message_double_escaped() {
2034        // Test that double-escaped sequences are handled
2035        let input = "feat: add feature\n\\\\n\\\\nDouble escaped";
2036        let result = render_final_commit_message(input);
2037        // Double backslash-n becomes backslash-n (literal backslash + n) after cleanup
2038        // The whitespace cleanup then removes the blank lines
2039        assert_eq!(result, "feat: add feature\n\\\n\\\nDouble escaped");
2040    }
2041
2042    #[test]
2043    fn test_render_final_commit_message_whitespace_cleanup() {
2044        // Test that trailing empty lines are removed
2045        let input = "feat: add feature\n\nBody text\n\n\n  \n  ";
2046        let result = render_final_commit_message(input);
2047        assert_eq!(result, "feat: add feature\nBody text");
2048    }
2049
2050    #[test]
2051    fn test_render_final_commit_message_mixed_escape_sequences() {
2052        // Test handling of mixed escape sequences
2053        let input = "feat: add feature\\n\\nDetails:\\r\\n\\t- item 1\\n\\t- item 2";
2054        let result = render_final_commit_message(input);
2055        // Carriage returns normalized to newlines, tabs stripped by trim, blank lines removed
2056        assert_eq!(result, "feat: add feature\nDetails:\n- item 1\n- item 2");
2057    }
2058
2059    #[test]
2060    fn test_render_final_commit_message_trailing_whitespace_lines() {
2061        // Test that empty lines with only whitespace are cleaned up
2062        let input = "feat: add feature\n\\n\\n  Body with spaces  \\n  \\n  ";
2063        let result = render_final_commit_message(input);
2064        // Whitespace cleanup removes blank lines and trims each line
2065        assert_eq!(result, "feat: add feature\nBody with spaces");
2066    }
2067
2068    // =========================================================================
2069    // Tests for try_extract_structured_commit
2070    // =========================================================================
2071
2072    #[test]
2073    fn test_try_extract_structured_commit_direct_json() {
2074        // Test that direct JSON with subject and body is extracted correctly
2075        let json = r#"{"subject":"fix(commit): try simpler prompts after agent errors","body":"When all agents fail for a prompt variant, keep iterating through progressively simpler prompt strategies instead of aborting the retry loop."}"#;
2076        let result = try_extract_structured_commit_with_trace(json).0;
2077        assert!(result.is_some(), "Should extract commit from direct JSON");
2078        let msg = result.unwrap();
2079        assert!(msg.starts_with("fix(commit):"), "Should start with type");
2080        assert!(msg.contains("try simpler prompts after agent errors"));
2081        assert!(msg.contains("When all agents fail"));
2082    }
2083
2084    #[test]
2085    fn test_try_extract_structured_commit_json_no_body() {
2086        // Test JSON with subject only
2087        let json = r#"{"subject":"feat: add new feature"}"#;
2088        let result = try_extract_structured_commit_with_trace(json).0;
2089        assert!(result.is_some());
2090        assert_eq!(result.unwrap(), "feat: add new feature");
2091    }
2092
2093    #[test]
2094    fn test_try_extract_structured_commit_code_fence() {
2095        // Test JSON inside markdown code fence
2096        let content = r#"Here is the commit message:
2097```json
2098{"subject":"fix: resolve bug","body":"Details about the fix."}
2099```
2100"#;
2101        let result = try_extract_structured_commit_with_trace(content).0;
2102        assert!(result.is_some());
2103        let msg = result.unwrap();
2104        assert!(msg.starts_with("fix: resolve bug"));
2105        assert!(msg.contains("Details about the fix"));
2106    }
2107
2108    #[test]
2109    fn test_try_extract_structured_commit_with_preamble() {
2110        // Test JSON with some preamble text
2111        let content = r#"Based on the diff, here is my commit:
2112{"subject":"refactor: simplify logic","body":"Removed unnecessary complexity."}"#;
2113        let result = try_extract_structured_commit_with_trace(content).0;
2114        assert!(result.is_some());
2115        let msg = result.unwrap();
2116        assert!(msg.starts_with("refactor:"));
2117    }
2118
2119    #[test]
2120    fn test_try_extract_structured_commit_invalid_type() {
2121        // Test JSON with invalid conventional commit type
2122        let json = r#"{"subject":"invalid: not a real type","body":"Body"}"#;
2123        let result = try_extract_structured_commit_with_trace(json).0;
2124        assert!(result.is_none(), "Should reject invalid commit type");
2125    }
2126
2127    #[test]
2128    fn test_try_extract_structured_commit_from_ndjson() {
2129        // Test extraction from NDJSON stream with result type
2130        let ndjson = r#"{"type":"stream_event","data":"..."}
2131{"type":"result","result":"{\"subject\":\"docs: update readme\",\"body\":\"Add usage examples.\"}"}
2132"#;
2133        let result = try_extract_structured_commit_with_trace(ndjson).0;
2134        assert!(result.is_some(), "Should extract from NDJSON result field");
2135        let msg = result.unwrap();
2136        assert!(msg.starts_with("docs: update readme"));
2137    }
2138
2139    #[test]
2140    fn test_try_extract_structured_commit_from_ndjson_with_markdown_fence() {
2141        // Test extraction from NDJSON stream where result contains markdown with JSON code fence
2142        // This is the format used by PROMPT-LOG5.log (GLM-4.7 with markdown JSON)
2143        let ndjson = r#"{"type":"stream_event","data":"..."}
2144{"type":"result","result":"The changes look clean. Now I'll generate the commit message:\n\n```json\n{\n  \"subject\": \"refactor(review): pass diff directly to all review prompts\",\n  \"body\": \"Previously, review prompts would tell agents to run git commands to\\nfetch the diff. This change:\\n\\n1. Fetches the diff once at the start of build_review_prompt\\n2. Passes it directly to all review prompt functions\"\n}\n```"}
2145"#;
2146        let result = try_extract_structured_commit_with_trace(ndjson).0;
2147        assert!(
2148            result.is_some(),
2149            "Should extract from NDJSON result field with markdown code fence"
2150        );
2151        let msg = result.unwrap();
2152        assert!(msg.starts_with("refactor(review):"));
2153        assert!(msg.contains("pass diff directly"));
2154    }
2155
2156    // =========================================================================
2157    // Tests for validate_commit_message - JSON artifact detection
2158    // =========================================================================
2159
2160    #[test]
2161    fn test_validate_commit_message_raw_json_structure() {
2162        // Test that raw JSON commit structure is rejected
2163        let raw_json = r#"{"subject":"fix: something","body":"Details"}"#;
2164        let result = validate_commit_message(raw_json);
2165        assert!(result.is_err(), "Raw JSON should be rejected");
2166        assert!(
2167            result.unwrap_err().contains("JSON"),
2168            "Error should mention JSON"
2169        );
2170    }
2171
2172    #[test]
2173    fn test_validate_commit_message_json_with_subject_key() {
2174        // Regression test: {"subject":...} pattern should be detected as JSON artifact
2175        let bad_msg = r#"{"subject":"feat: add feature","body":"Some body"}"#;
2176        let result = validate_commit_message(bad_msg);
2177        assert!(
2178            result.is_err(),
2179            "Commit message containing {{\"subject\":}} should be rejected"
2180        );
2181    }
2182
2183    // =========================================================================
2184    // Tests for XML extraction (try_extract_xml_commit)
2185    // =========================================================================
2186
2187    #[test]
2188    fn test_xml_extract_basic_subject_only() {
2189        // Test basic XML extraction with subject only
2190        let content = r"<ralph-commit>
2191<ralph-subject>feat: add new feature</ralph-subject>
2192</ralph-commit>";
2193        let result = try_extract_xml_commit_with_trace(content).0;
2194        assert!(result.is_some(), "Should extract from basic XML");
2195        assert_eq!(result.unwrap(), "feat: add new feature");
2196    }
2197
2198    #[test]
2199    fn test_xml_extract_with_body() {
2200        // Test XML extraction with subject and body
2201        let content = r"<ralph-commit>
2202<ralph-subject>feat(auth): add OAuth2 login flow</ralph-subject>
2203<ralph-body>Implement Google and GitHub OAuth providers.
2204Add session management for OAuth tokens.</ralph-body>
2205</ralph-commit>";
2206        let result = try_extract_xml_commit_with_trace(content).0;
2207        assert!(result.is_some(), "Should extract from XML with body");
2208        let msg = result.unwrap();
2209        assert!(msg.starts_with("feat(auth): add OAuth2 login flow"));
2210        assert!(msg.contains("Implement Google and GitHub OAuth providers"));
2211        assert!(msg.contains("Add session management"));
2212    }
2213
2214    #[test]
2215    fn test_xml_extract_with_empty_body() {
2216        // Test XML extraction with empty body tags
2217        let content = r"<ralph-commit>
2218<ralph-subject>fix: resolve bug</ralph-subject>
2219<ralph-body></ralph-body>
2220</ralph-commit>";
2221        let result = try_extract_xml_commit_with_trace(content).0;
2222        assert!(result.is_some(), "Should extract even with empty body");
2223        // Empty body should be treated as no body
2224        assert_eq!(result.unwrap(), "fix: resolve bug");
2225    }
2226
2227    #[test]
2228    fn test_xml_extract_ignores_preamble() {
2229        // Test that content before <ralph-commit> is ignored
2230        let content = r"Here is the commit message based on my analysis:
2231
2232Looking at the diff, I can see...
2233
2234<ralph-commit>
2235<ralph-subject>refactor: simplify logic</ralph-subject>
2236</ralph-commit>
2237
2238That's all!";
2239        let result = try_extract_xml_commit_with_trace(content).0;
2240        assert!(result.is_some(), "Should ignore preamble and extract XML");
2241        assert_eq!(result.unwrap(), "refactor: simplify logic");
2242    }
2243
2244    #[test]
2245    fn test_xml_extract_fails_missing_tags() {
2246        // Test that extraction fails when tags are missing
2247        let content = "Just some text without XML tags";
2248        let result = try_extract_xml_commit_with_trace(content).0;
2249        assert!(result.is_none(), "Should fail when XML tags are missing");
2250    }
2251
2252    #[test]
2253    fn test_xml_extract_fails_invalid_commit_type() {
2254        // Test that extraction fails for invalid conventional commit types
2255        let content = r"<ralph-commit>
2256<ralph-subject>invalid: not a real type</ralph-subject>
2257</ralph-commit>";
2258        let result = try_extract_xml_commit_with_trace(content).0;
2259        assert!(result.is_none(), "Should reject invalid commit type");
2260    }
2261
2262    #[test]
2263    fn test_xml_extract_fails_missing_subject() {
2264        // Test that extraction fails when subject is missing
2265        let content = r"<ralph-commit>
2266<ralph-body>Just a body, no subject</ralph-body>
2267</ralph-commit>";
2268        let result = try_extract_xml_commit_with_trace(content).0;
2269        assert!(result.is_none(), "Should fail when subject is missing");
2270    }
2271
2272    #[test]
2273    fn test_xml_extract_fails_empty_subject() {
2274        // Test that extraction fails when subject is empty
2275        let content = r"<ralph-commit>
2276<ralph-subject></ralph-subject>
2277</ralph-commit>";
2278        let result = try_extract_xml_commit_with_trace(content).0;
2279        assert!(result.is_none(), "Should fail when subject is empty");
2280    }
2281
2282    #[test]
2283    fn test_xml_extract_handles_whitespace_in_subject() {
2284        // Test that whitespace around subject is trimmed
2285        let content = r"<ralph-commit>
2286<ralph-subject>   docs: update readme   </ralph-subject>
2287</ralph-commit>";
2288        let result = try_extract_xml_commit_with_trace(content).0;
2289        assert!(result.is_some(), "Should handle whitespace in subject");
2290        assert_eq!(result.unwrap(), "docs: update readme");
2291    }
2292
2293    #[test]
2294    fn test_xml_extract_with_breaking_change() {
2295        // Test XML extraction with breaking change indicator
2296        let content = r"<ralph-commit>
2297<ralph-subject>feat!: drop Python 3.7 support</ralph-subject>
2298<ralph-body>BREAKING CHANGE: Minimum Python version is now 3.8.</ralph-body>
2299</ralph-commit>";
2300        let result = try_extract_xml_commit_with_trace(content).0;
2301        assert!(result.is_some(), "Should handle breaking change indicator");
2302        let msg = result.unwrap();
2303        assert!(msg.starts_with("feat!:"));
2304        assert!(msg.contains("BREAKING CHANGE"));
2305    }
2306
2307    #[test]
2308    fn test_xml_extract_with_scope() {
2309        // Test XML extraction with scope
2310        let content = r"<ralph-commit>
2311<ralph-subject>test(parser): add coverage for edge cases</ralph-subject>
2312</ralph-commit>";
2313        let result = try_extract_xml_commit_with_trace(content).0;
2314        assert!(result.is_some(), "Should handle scope in subject");
2315        assert_eq!(result.unwrap(), "test(parser): add coverage for edge cases");
2316    }
2317
2318    #[test]
2319    fn test_xml_extract_body_preserves_newlines() {
2320        // Test that newlines in body are preserved
2321        let content = r"<ralph-commit>
2322<ralph-subject>feat: add feature</ralph-subject>
2323<ralph-body>Line 1
2324Line 2
2325Line 3</ralph-body>
2326</ralph-commit>";
2327        let result = try_extract_xml_commit_with_trace(content).0;
2328        assert!(result.is_some(), "Should preserve newlines in body");
2329        let msg = result.unwrap();
2330        assert!(msg.contains("Line 1\nLine 2\nLine 3"));
2331    }
2332
2333    #[test]
2334    fn test_xml_extract_fails_malformed_tags() {
2335        // Test that extraction fails for malformed tags (end before start)
2336        let content = r"</ralph-commit>
2337<ralph-subject>feat: add feature</ralph-subject>
2338<ralph-commit>";
2339        let result = try_extract_xml_commit_with_trace(content).0;
2340        assert!(result.is_none(), "Should fail for malformed tags");
2341    }
2342
2343    #[test]
2344    fn test_xml_extract_handles_markdown_code_fence() {
2345        // Test that XML inside markdown code fence is NOT extracted
2346        // (the XML tags should be directly in the output, not wrapped)
2347        let content = r"```xml
2348<ralph-commit>
2349<ralph-subject>feat: add feature</ralph-subject>
2350</ralph-commit>
2351```";
2352        // The XML extractor looks for tags directly, so this should still work
2353        // since the tags are present in the content
2354        let result = try_extract_xml_commit_with_trace(content).0;
2355        assert!(
2356            result.is_some(),
2357            "Should extract from XML even inside code fence"
2358        );
2359    }
2360
2361    // Tests for placeholder validation with technical contexts
2362    #[test]
2363    fn test_validate_accepts_template_placeholders() {
2364        // "template placeholders" should be accepted as technical context
2365        let result = validate_commit_message("feat: substitute template placeholders in config");
2366        assert!(
2367            result.is_ok(),
2368            "Should accept 'template placeholders' as technical context"
2369        );
2370    }
2371
2372    #[test]
2373    fn test_validate_accepts_template_placeholder() {
2374        // "template placeholder" (singular) should be accepted
2375        let result = validate_commit_message("fix: update template placeholder handling");
2376        assert!(
2377            result.is_ok(),
2378            "Should accept 'template placeholder' as technical context"
2379        );
2380    }
2381
2382    #[test]
2383    fn test_validate_accepts_placeholder_variable() {
2384        // "placeholder variable" should be accepted
2385        let result = validate_commit_message("refactor: rename placeholder variable in template");
2386        assert!(
2387            result.is_ok(),
2388            "Should accept 'placeholder variable' as technical context"
2389        );
2390    }
2391
2392    #[test]
2393    fn test_validate_accepts_placeholder_variables() {
2394        // "placeholder variables" (plural) should be accepted
2395        let result = validate_commit_message("docs: document placeholder variables usage");
2396        assert!(
2397            result.is_ok(),
2398            "Should accept 'placeholder variables' as technical context"
2399        );
2400    }
2401
2402    #[test]
2403    fn test_validate_accepts_placeholder_value() {
2404        // "placeholder value" should be accepted
2405        let result = validate_commit_message("fix: set default placeholder value");
2406        assert!(
2407            result.is_ok(),
2408            "Should accept 'placeholder value' as technical context"
2409        );
2410    }
2411
2412    #[test]
2413    fn test_validate_accepts_placeholder_values() {
2414        // "placeholder values" (plural) should be accepted
2415        let result = validate_commit_message("feat: add support for placeholder values");
2416        assert!(
2417            result.is_ok(),
2418            "Should accept 'placeholder values' as technical context"
2419        );
2420    }
2421
2422    #[test]
2423    fn test_validate_accepts_substitute_placeholder() {
2424        // Context around "substitute" and "placeholder" should be accepted
2425        let result = validate_commit_message("fix: properly substitute placeholder in output");
2426        assert!(
2427            result.is_ok(),
2428            "Should accept 'substitute placeholder' as technical context"
2429        );
2430    }
2431
2432    #[test]
2433    fn test_validate_accepts_substituting_placeholder() {
2434        // "substituting placeholder" should be accepted
2435        let result =
2436            validate_commit_message("refactor: add logic for substituting placeholder tokens");
2437        assert!(
2438            result.is_ok(),
2439            "Should accept 'substituting placeholder' as technical context"
2440        );
2441    }
2442
2443    #[test]
2444    fn test_validate_accepts_replace_placeholder() {
2445        // "replace placeholder" should be accepted
2446        let result = validate_commit_message("feat: replace placeholder with actual value");
2447        assert!(
2448            result.is_ok(),
2449            "Should accept 'replace placeholder' as technical context"
2450        );
2451    }
2452
2453    #[test]
2454    fn test_validate_rejects_actual_placeholder_filler() {
2455        // Actual filler patterns should still be rejected
2456        let result = validate_commit_message("feat: [placeholder]");
2457        assert!(
2458            result.is_err(),
2459            "Should reject '[placeholder]' filler pattern"
2460        );
2461    }
2462
2463    #[test]
2464    fn test_validate_rejects_placeholder_for() {
2465        // "placeholder for" should be rejected
2466        let result = validate_commit_message("feat: this is a placeholder for the real thing");
2467        assert!(
2468            result.is_err(),
2469            "Should reject 'placeholder for' as filler pattern"
2470        );
2471    }
2472
2473    #[test]
2474    fn test_validate_rejects_is_a_placeholder() {
2475        // "is a placeholder" should be rejected
2476        let result = validate_commit_message("feat: this text is a placeholder");
2477        assert!(
2478            result.is_err(),
2479            "Should reject 'is a placeholder' as filler pattern"
2480        );
2481    }
2482
2483    #[test]
2484    fn test_validate_rejects_bare_placeholder() {
2485        // Bare "placeholder" without technical context should be rejected
2486        let result = validate_commit_message("feat: add placeholder support");
2487        assert!(
2488            result.is_err(),
2489            "Should reject bare 'placeholder' without technical context"
2490        );
2491    }
2492
2493    #[test]
2494    fn test_validate_accepts_remove_placeholder_from_ui() {
2495        // "remove placeholder from UI" should be accepted - it's a real commit about placeholders
2496        let result = validate_commit_message("fix: remove placeholder from UI");
2497        assert!(
2498            result.is_ok(),
2499            "Should accept 'remove placeholder from UI' as it describes a real change to placeholder functionality"
2500        );
2501    }
2502
2503    #[test]
2504    fn test_validate_accepts_delete_placeholder() {
2505        // "delete placeholder" should be accepted
2506        let result = validate_commit_message("refactor: delete placeholder from login form");
2507        assert!(
2508            result.is_ok(),
2509            "Should accept 'delete placeholder' as it describes a real change"
2510        );
2511    }
2512
2513    #[test]
2514    fn test_validate_accepts_placeholder_at_beginning() {
2515        // "placeholder" at the beginning with valid context should be accepted
2516        let result = validate_commit_message("placeholder attribute added to input field");
2517        assert!(
2518            result.is_ok(),
2519            "Should accept 'placeholder' at beginning with valid technical context"
2520        );
2521    }
2522
2523    #[test]
2524    fn test_validate_accepts_placeholder_at_end() {
2525        // "placeholder" at the end with valid context should be accepted
2526        let result = validate_commit_message("fix: clear input placeholder");
2527        assert!(
2528            result.is_ok(),
2529            "Should accept 'placeholder' at end with valid action"
2530        );
2531    }
2532}
ralph_workflow/files/llm_output_extraction/commit.rs

ralph_workflow/files/llm_output_extraction/
commit.rs