use std::sync::LazyLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::config::CommitFormat;
use crate::domain::CommitType;
use crate::error::{Error, Result};
/// Commit message broken into its individual fields, (de)serializable with serde.
///
/// `scope`, `body` and `breaking_change` are optional; `commit_type` and `subject` are
/// required when deserializing.
#[derive(Debug, Deserialize, Serialize)]
pub struct StructuredCommit {
/// Commit type (checked against `CommitType::ALL` when formatted); stored under the
/// serialized key `type`.
#[serde(rename = "type")]
pub commit_type: String,
/// Optional scope placed in parentheses after the type.
pub scope: Option<String>,
/// One-line summary that follows the `type(scope): ` prefix.
pub subject: String,
/// Optional free-form body text.
pub body: Option<String>,
/// Optional description of a breaking change; rendered as a `BREAKING CHANGE:` footer.
pub breaking_change: Option<String>, }
/// Accepts a normalized scope: a lowercase ASCII letter or digit followed by any number of
/// lowercase letters, digits, `-`, `_`, `/` or `.`.
static SCOPE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^[a-z0-9][a-z0-9\-_/.]*$").expect("SCOPE_REGEX is a valid pattern")
});

/// Matches a triple-backtick fence together with everything between the opening and the
/// nearest closing fence (the fenced content is part of the match, not just the markers).
static CODE_FENCE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"```[\s\S]*?```").expect("CODE_FENCE_REGEX is a valid pattern")
});

/// Matches a complete `<thought>…</thought>` or `<think>…</think>` block, shortest match.
///
/// Case-insensitive so that it agrees with [`UNCLOSED_THOUGHT_REGEX`]: otherwise a block
/// written as `<Think>…</Think>` would only lose its opening tag and the reasoning text
/// would stay in the message.
static THOUGHT_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?is)<(?:thought|think)>[\s\S]*?</(?:thought|think)>")
        .expect("THOUGHT_BLOCK_REGEX is a valid pattern")
});

/// Matches an opening `<thought>` / `<think>` tag (any letter case) at the very start of the
/// text, including surrounding whitespace. Only the tag is matched, not what follows it.
static UNCLOSED_THOUGHT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)^\s*<(?:thought|think)>\s*")
        .expect("UNCLOSED_THOUGHT_REGEX is a valid pattern")
});

/// Finds a commit type from `CommitType::ALL` that sits at the start of a line or after
/// whitespace and is immediately followed by `(`, `!` or `:`.
///
/// The match includes the preceding whitespace character, if any.
static VALID_TYPE_START_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    let types = CommitType::ALL.join("|");
    Regex::new(&format!(r"(?m)(?:^|\s)({})(?:\(|!|:)", types))
        .expect("commit type names form a valid regex alternation")
});
/// Lowercase lead-in phrases that `clean_text` looks for (after lowercasing the text).
/// When one is found, everything up to and including the phrase is discarded.
static PREAMBLE_PATTERNS: &[&str] = &[
"here's the commit message",
"here is the commit message",
"commit message:",
"suggested commit:",
];
/// Stateless namespace for turning raw text into a conventional-commit message.
/// All functionality lives in associated functions; `sanitize` is the main entry point.
pub struct CommitSanitizer;
impl CommitSanitizer {
/// Greedily word-wraps `text` so that lines stay within `max_width` characters.
///
/// Each input line (split on `\n`) is wrapped on its own: existing line breaks are kept,
/// whitespace-only lines become empty lines, and runs of whitespace between words collapse
/// to a single space. Width is measured in `char`s. A single word longer than `max_width`
/// is never split and ends up on a line of its own.
fn wrap_body(text: &str, max_width: usize) -> String {
    let wrap_line = |line: &str| -> String {
        let mut wrapped = String::new();
        let mut words = line.split_whitespace();

        // A blank (or whitespace-only) line contributes an empty line to the output.
        let Some(head) = words.next() else {
            return wrapped;
        };
        wrapped.push_str(head);
        let mut column = head.chars().count();

        for word in words {
            let width = word.chars().count();
            // `+ 1` accounts for the separating space that would precede the word.
            if column + 1 + width > max_width {
                wrapped.push('\n');
                column = width;
            } else {
                wrapped.push(' ');
                column += 1 + width;
            }
            wrapped.push_str(word);
        }
        wrapped
    };

    text.split('\n')
        .map(wrap_line)
        .collect::<Vec<_>>()
        .join("\n")
}
/// Builds the `BREAKING CHANGE: …` footer for `desc`.
///
/// The description is trimmed and wrapped so that the first line, including the prefix,
/// fits in 72 columns. Every following line is emitted on its own line, prefixed with the
/// continuation indent. An empty description yields just the prefix.
fn format_breaking_footer(desc: &str) -> String {
    const PREFIX: &str = "BREAKING CHANGE: ";
    // Width left for text on the first line once the prefix is accounted for.
    const FIRST_LINE_BUDGET: usize = 72 - PREFIX.len();

    let wrapped = Self::wrap_body(desc.trim(), FIRST_LINE_BUDGET);
    // Joining with "\n " puts the continuation indent in front of every line but the first.
    let continued = wrapped.lines().collect::<Vec<_>>().join("\n ");
    format!("{}{}", PREFIX, continued)
}
/// Converts `raw` into a commit message, preferring structured JSON when present.
///
/// If a [`StructuredCommit`] can be parsed out of `raw`, it is rendered with
/// `format_structured`. Otherwise the text is cleaned with `clean_text`, checked with
/// `validate_conventional`, and its first line must not exceed 72 characters. In both
/// cases control characters other than `\n` and `\t` are removed from the result.
///
/// # Errors
///
/// Returns [`Error::InvalidCommitMessage`] when the plain-text first line is longer than
/// 72 characters, and propagates any error from `format_structured` or
/// `validate_conventional`.
pub fn sanitize(raw: &str, format: &CommitFormat) -> Result<String> {
    let message = match Self::try_parse_json(raw) {
        Ok(structured) => Self::format_structured(&structured, format)?,
        Err(()) => {
            let cleaned = Self::clean_text(raw, format);
            Self::validate_conventional(&cleaned)?;

            let first_line = cleaned.lines().next().unwrap_or("");
            let width = first_line.chars().count();
            if width > 72 {
                return Err(Error::InvalidCommitMessage(format!(
                    "First line is {} chars (max 72): '{}'",
                    width, first_line,
                )));
            }
            cleaned
        }
    };
    Ok(Self::strip_control_chars(&message))
}
/// Returns `s` without control characters, keeping only newline and tab among them.
fn strip_control_chars(s: &str) -> String {
    let mut kept = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\n' | '\t') || !ch.is_control() {
            kept.push(ch);
        }
    }
    kept
}
/// Tries to locate and deserialize a JSON-encoded [`StructuredCommit`] inside `raw`.
///
/// Thought blocks and a leading unclosed thought tag are removed first. Two candidate
/// start positions are then tried in order, each running up to the last `}` in the text:
///
/// 1. the closest `{` before the first occurrence of `type`;
/// 2. the first `{` in the text.
///
/// Returns `Err(())` when neither candidate deserializes.
fn try_parse_json(raw: &str) -> std::result::Result<StructuredCommit, ()> {
    let without_blocks = THOUGHT_BLOCK_REGEX.replace_all(raw, "");
    let without_tag = UNCLOSED_THOUGHT_REGEX.replace(&without_blocks, "");
    let text = without_tag.trim();

    // Parses the span from byte offset `start` through the last closing brace after it.
    let parse_from = |start: usize| -> Option<StructuredCommit> {
        let candidate = &text[start..];
        let end = candidate.rfind('}')?;
        serde_json::from_str::<StructuredCommit>(&candidate[..=end]).ok()
    };

    let brace_before_type = text
        .find("type")
        .and_then(|key_pos| text[..key_pos].rfind('{'));

    brace_before_type
        .and_then(&parse_from)
        .or_else(|| text.find('{').and_then(&parse_from))
        .ok_or(())
}
fn format_structured(s: &StructuredCommit, format: &CommitFormat) -> Result<String> {
let commit_type = s.commit_type.to_lowercase();
if !CommitType::ALL.contains(&commit_type.as_str()) {
return Err(Error::InvalidCommitMessage(format!(
"Invalid commit type: '{}'. Must be one of: {}",
commit_type,
CommitType::ALL.join(", ")
)));
}
let scope = if format.include_scope {
if let Some(ref raw_scope) = s.scope {
let sanitized = raw_scope
.to_lowercase()
.replace(' ', "-")
.replace("--", "-");
if sanitized.is_empty() {
None
} else if !SCOPE_REGEX.is_match(&sanitized) {
None
} else {
Some(sanitized)
}
} else {
None
}
} else {
None
};
let breaking_change: Option<String> = s
.breaking_change
.as_deref()
.filter(|bc| {
let t = bc.trim();
!t.is_empty() && !t.eq_ignore_ascii_case("null")
})
.map(|bc| bc.trim().to_string());
let is_breaking = breaking_change.is_some();
let subject = {
let trimmed = s.subject.trim().trim_end_matches('.');
if format.lowercase_subject {
let mut chars = trimmed.chars();
match chars.next() {
Some(first) => first.to_lowercase().chain(chars).collect(),
None => String::new(),
}
} else {
trimmed.to_string()
}
};
let bang = if is_breaking { "!" } else { "" };
let first_line = match scope {
Some(ref sc) => format!("{}({}){}: {}", commit_type, sc, bang, subject),
None => format!("{}{}: {}", commit_type, bang, subject),
};
if first_line.chars().count() > 72 {
return Err(Error::InvalidCommitMessage(format!(
"First line is {} chars (max 72): '{}'",
first_line.chars().count(),
first_line,
)));
}
let body_section: Option<String> = if format.include_body {
match &s.body {
Some(body) if !body.trim().is_empty() => Some(Self::wrap_body(body.trim(), 72)),
_ => None,
}
} else {
None
};
let footer_section: Option<String> =
breaking_change.as_deref().map(Self::format_breaking_footer);
let message = match (body_section, footer_section) {
(Some(body), Some(footer)) => format!("{}\n\n{}\n\n{}", first_line, body, footer),
(Some(body), None) => format!("{}\n\n{}", first_line, body),
(None, Some(footer)) => format!("{}\n\n{}", first_line, footer),
(None, None) => first_line,
};
Ok(message)
}
/// Strips wrapper noise from free-form text so that a commit message remains.
///
/// Steps, in order:
/// 1. remove thought blocks, a leading unclosed thought tag, and fenced code blocks
///    (the fenced content is removed together with the fence markers);
/// 2. if a valid commit type start is found, drop everything before it;
/// 3. trim, then remove one pair of surrounding double quotes and one pair of surrounding
///    single quotes;
/// 4. for each entry of `PREAMBLE_PATTERNS` found (ASCII case-insensitively), drop the text
///    up to and including it, plus any leading `:` and surrounding whitespace;
/// 5. when `format.lowercase_subject` is set, lowercase the first character after the
///    first `": "`.
fn clean_text(raw: &str, format: &CommitFormat) -> String {
    let mut cleaned = THOUGHT_BLOCK_REGEX.replace_all(raw, "").into_owned();
    cleaned = UNCLOSED_THOUGHT_REGEX.replace(&cleaned, "").into_owned();
    cleaned = CODE_FENCE_REGEX.replace_all(&cleaned, "").into_owned();

    if let Some(mat) = VALID_TYPE_START_REGEX.find(&cleaned) {
        // The match may begin with the whitespace preceding the type; the trim below
        // removes it.
        cleaned = cleaned[mat.start()..].to_string();
    }
    cleaned = cleaned.trim().to_string();

    // Quote characters are single-byte ASCII, so slicing one byte off each end is safe.
    for quote in ['"', '\''] {
        if cleaned.len() >= 2 && cleaned.starts_with(quote) && cleaned.ends_with(quote) {
            cleaned = cleaned[1..cleaned.len() - 1].to_string();
        }
    }

    for pattern in PREAMBLE_PATTERNS {
        // ASCII-only lowercasing keeps every byte offset identical to `cleaned`. Full
        // Unicode lowercasing can change byte lengths (e.g. 'İ'), which would make `pos`
        // point at the wrong place, or inside a character, and panic when slicing.
        let lower = cleaned.to_ascii_lowercase();
        if let Some(pos) = lower.find(pattern) {
            let after = &cleaned[pos + pattern.len()..];
            cleaned = after.trim_start_matches(':').trim().to_string();
        }
    }

    if format.lowercase_subject
        && let Some(colon_pos) = cleaned.find(": ")
    {
        let (prefix, rest) = cleaned.split_at(colon_pos + 2);
        let mut chars = rest.chars();
        if let Some(first) = chars.next() {
            let lowered: String = first.to_lowercase().chain(chars).collect();
            cleaned = format!("{}{}", prefix, lowered);
        }
    }
    cleaned
}
fn validate_conventional(message: &str) -> Result<()> {
let first_line = message.lines().next().unwrap_or("");
let has_valid_type = CommitType::ALL.iter().any(|t| {
first_line.starts_with(&format!("{}:", t)) || first_line.starts_with(&format!("{}(", t)) || first_line.starts_with(&format!("{}!", t)) });
if !has_valid_type {
return Err(Error::InvalidCommitMessage(format!(
"Message doesn't start with a valid type. Got: '{}'",
first_line.chars().take(20).collect::<String>()
)));
}
Ok(())
}
/// Extracts a [`StructuredCommit`] from `raw`, or returns `None` when no JSON object in
/// the text deserializes into one. Public wrapper around `try_parse_json`.
#[must_use]
pub fn parse_structured(raw: &str) -> Option<StructuredCommit> {
Self::try_parse_json(raw).ok()
}
}
/// Stateless namespace for checking a [`StructuredCommit`] against caller-supplied facts
/// about the change and for formatting the resulting violations.
pub struct CommitValidator;
impl CommitValidator {
#[must_use]
pub fn validate(
commit: &StructuredCommit,
has_bug_evidence: bool,
is_mechanical: bool,
public_api_removed_count: usize,
is_dependency_only: bool,
) -> Vec<String> {
let mut violations = Vec::new();
let commit_type = commit.commit_type.to_lowercase();
if commit_type == "fix" && !has_bug_evidence {
violations.push(
"Type is \"fix\" but no bug-fix comments were found in the diff. \
Use \"refactor\" instead."
.to_string(),
);
}
if commit.breaking_change.is_none() && public_api_removed_count > 0 {
violations.push(
"Public APIs were removed but breaking_change is null. \
Describe what was removed in plain English."
.to_string(),
);
}
if let Some(ref bc) = commit.breaking_change {
let lower = bc.to_lowercase();
if lower.contains("public_api_removed")
|| lower.contains("bug_evidence")
|| lower.contains("mechanical_transform")
|| lower.contains("dependency_only")
{
violations.push(
"The breaking_change field contains internal label names. \
Describe the actual API change in plain English."
.to_string(),
);
}
}
if is_mechanical && matches!(commit_type.as_str(), "feat" | "fix") {
violations.push(
"Change is a mechanical/formatting transform but type is \"feat\"/\"fix\". \
Use \"style\" or \"refactor\"."
.to_string(),
);
}
if is_dependency_only && commit_type != "chore" {
violations.push(
"All changes are in dependency/config files but type is not \"chore\".".to_string(),
);
}
if Self::is_generic_subject(&commit.subject) {
violations.push(
"Subject is too generic — name the specific API, function, or module changed. \
Avoid vague verbs like \"update\", \"improve\", \"change\"."
.to_string(),
);
}
let subject_trimmed = commit.subject.trim().trim_end_matches('.');
let prefix_len = commit.commit_type.len()
+ commit.scope.as_ref().map(|s| s.len() + 2).unwrap_or(0)
+ if commit.breaking_change.is_some() {
1
} else {
0
}
+ 2; let first_line_len = prefix_len + subject_trimmed.chars().count();
if first_line_len > 72 {
let budget = 72_usize.saturating_sub(prefix_len);
violations.push(format!(
"Subject is {} chars but must be under {} chars \
(first line would be {} chars, max 72). Shorten it.",
subject_trimmed.chars().count(),
budget,
first_line_len,
));
}
violations
}
/// Reports whether `subject` is a short, vague phrase.
///
/// A subject is generic when it has at most four whitespace-separated words, the first
/// word is one of `GENERIC_VERBS`, and any later word is one of `GENERIC_NOUNS`.
/// Comparison is case-insensitive and on whole words.
fn is_generic_subject(subject: &str) -> bool {
    const GENERIC_VERBS: &[&str] = &["update", "improve", "change", "modify", "enhance"];
    const GENERIC_NOUNS: &[&str] = &[
        "code",
        "things",
        "stuff",
        "functionality",
        "logic",
        "implementation",
        "behavior",
        "performance",
        "handling",
        "processing",
    ];

    // Longer subjects are assumed to carry enough detail.
    if subject.split_whitespace().count() > 4 {
        return false;
    }

    let mut words = subject.split_whitespace().map(str::to_lowercase);
    let Some(verb) = words.next() else {
        return false;
    };
    if !GENERIC_VERBS.contains(&verb.as_str()) {
        return false;
    }
    words.any(|word| GENERIC_NOUNS.contains(&word.as_str()))
}
/// Formats `violations` as a "CORRECTIONS" section: a header line followed by one
/// `- `-prefixed line per violation. The section starts and ends with a newline.
#[must_use]
pub fn format_corrections(violations: &[String]) -> String {
    let mut section =
        String::from("\nCORRECTIONS (your previous output had these errors — fix them):\n");
    section.extend(
        violations
            .iter()
            .flat_map(|violation| ["- ", violation.as_str(), "\n"]),
    );
    section
}
}