llm_git/
normalization.rs

//! Normalization utilities for commit messages
2use unicode_normalization::UnicodeNormalization;
3
4use crate::{config::CommitConfig, types::ConventionalCommit, validation::is_past_tense_verb};
5
6/// Normalize Unicode characters to ASCII (remove AI-style formatting)
7/// Normalize Unicode characters to ASCII (remove AI-style formatting)
8pub fn normalize_unicode(text: &str) -> String {
9   // Pre-NFKD replacements for chars that decompose badly
10   // (≠ → = + combining, ½ → 1⁄2, ² → 2)
11   let pre_normalized = text
12      // Math symbols that decompose badly
13      .replace('≠', "!=") // not equal to (decomposes to = + \u{338})
14      // Fractions (NFKD decomposes ½ to 1⁄2 with fraction slash, not regular /)
15      .replace('½', "1/2")
16      .replace('¼', "1/4")
17      .replace('¾', "3/4")
18      .replace('⅓', "1/3")
19      .replace('⅔', "2/3")
20      .replace('⅕', "1/5")
21      .replace('⅖', "2/5")
22      .replace('⅗', "3/5")
23      .replace('⅘', "4/5")
24      .replace('⅙', "1/6")
25      .replace('⅚', "5/6")
26      .replace('⅛', "1/8")
27      .replace('⅜', "3/8")
28      .replace('⅝', "5/8")
29      .replace('⅞', "7/8")
30      // Superscripts (NFKD decomposes ² to just "2", losing the superscript meaning)
31      .replace('⁰', "^0")
32      .replace('¹', "^1")
33      .replace('²', "^2")
34      .replace('³', "^3")
35      .replace('⁴', "^4")
36      .replace('⁵', "^5")
37      .replace('⁶', "^6")
38      .replace('⁷', "^7")
39      .replace('⁸', "^8")
40      .replace('⁹', "^9")
41      // Subscripts
42      .replace('₀', "_0")
43      .replace('₁', "_1")
44      .replace('₂', "_2")
45      .replace('₃', "_3")
46      .replace('₄', "_4")
47      .replace('₅', "_5")
48      .replace('₆', "_6")
49      .replace('₇', "_7")
50      .replace('₈', "_8")
51      .replace('₉', "_9");
52
53   // Apply NFKD normalization for canonical decomposition
54   let normalized: String = pre_normalized.nfkd().collect();
55
56   normalized
57      // Smart quotes to straight quotes
58      .replace(['\u{2018}', '\u{2019}'], "'") // ' right single quote / apostrophe
59      .replace(['\u{201C}', '\u{201D}'], "\"") // " right double quote
60      .replace('\u{201A}', "'") // ‚ single low-9 quote
61      .replace(['\u{201E}', '\u{00AB}', '\u{00BB}'], "\"") // » right-pointing double angle quote
62      .replace(['\u{2039}', '\u{203A}'], "'") // › single right-pointing angle quote
63      // Dashes and hyphens
64      .replace(['\u{2010}', '\u{2011}', '\u{2012}'], "-") // ‒ figure dash
65      .replace(['\u{2013}', '\u{2014}', '\u{2015}'], "--") // ― horizontal bar
66      .replace('\u{2212}', "-") // − minus sign
67      // Arrows
68      .replace('\u{2192}', "->") // rightwards arrow
69      .replace('←', "<-") // leftwards arrow
70      .replace('↔', "<->") // left right arrow
71      .replace('⇒', "=>") // rightwards double arrow
72      .replace('⇐', "<=") // leftwards double arrow
73      .replace('⇔', "<=>") // left right double arrow
74      .replace('↑', "^") // upwards arrow
75      .replace('↓', "v") // downwards arrow
76      // Math symbols
77      .replace('\u{2264}', "<=") // less than or equal to
78      .replace('≥', ">=") // greater than or equal to
79      .replace('≈', "~=") // approximately equal to
80      .replace('≡', "==") // identical to
81      .replace('\u{00D7}', "x") // multiplication sign
82      .replace('÷', "/") // division sign
83      // Ellipsis
84      .replace(['\u{2026}', '⋯', '⋮'], "...") // vertical ellipsis
85      // Bullet points (convert to hyphens for consistency)
86      .replace(['•', '◦', '▪', '▫', '◆', '◇'], "-") // white diamond
87      // Check marks
88      .replace(['✓', '✔'], "v") // heavy check mark
89      .replace(['✗', '✘'], "x") // heavy ballot x
90      // Greek letters (common in programming)
91      .replace('λ', "lambda")
92      .replace('α', "alpha")
93      .replace('β', "beta")
94      .replace('γ', "gamma")
95      .replace('δ', "delta")
96      .replace('ε', "epsilon")
97      .replace('θ', "theta")
98      .replace('μ', "mu")
99      .replace('π', "pi")
100      .replace('σ', "sigma")
101      .replace('Σ', "Sigma")
102      .replace('Δ', "Delta")
103      .replace('Π', "Pi")
104      // Special spaces to regular space
105      .replace(
106         [
107            '\u{00A0}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
108            '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200A}', '\u{202F}', '\u{205F}',
109            '\u{3000}',
110         ],
111         " ",
112      ) // ideographic space
113      // Zero-width characters (remove)
114      .replace(['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'], "") // zero-width no-break space (BOM)
115}
116
/// Rough token estimate for `text`: one token per four bytes of UTF-8,
/// rounded up (so the empty string costs zero tokens).
const fn estimate_tokens(text: &str) -> usize {
   const BYTES_PER_TOKEN: usize = 4;
   text.len().div_ceil(BYTES_PER_TOKEN)
}
121
122/// Cap detail points by token budget instead of hard count
123/// Keeps high-priority details until budget exhausted
124pub fn cap_details(details: &mut Vec<String>, max_tokens: usize) {
125   if details.is_empty() {
126      return;
127   }
128
129   // Calculate total tokens
130   let total_tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
131
132   if total_tokens <= max_tokens {
133      return; // Under budget, keep all
134   }
135
136   // Score by priority keywords and length
137   let mut scored: Vec<(usize, i32, usize, &String)> = details
138      .iter()
139      .enumerate()
140      .map(|(idx, detail)| {
141         let lower = detail.to_lowercase();
142         let mut score = 0;
143
144         // High priority keywords (security, crashes, critical bugs)
145         if lower.contains("security")
146            || lower.contains("vulnerability")
147            || lower.contains("exploit")
148            || lower.contains("critical")
149            || (lower.contains("fix") && lower.contains("crash"))
150         {
151            score += 100;
152         }
153         if lower.contains("breaking") || lower.contains("incompatible") {
154            score += 90;
155         }
156         if lower.contains("performance")
157            || lower.contains("faster")
158            || lower.contains("optimization")
159         {
160            score += 80;
161         }
162         if lower.contains("fix") || lower.contains("bug") {
163            score += 70;
164         }
165
166         // Medium priority keywords
167         if lower.contains("api") || lower.contains("interface") || lower.contains("public") {
168            score += 50;
169         }
170         if lower.contains("user") || lower.contains("client") {
171            score += 40;
172         }
173         if lower.contains("deprecated") || lower.contains("removed") {
174            score += 35;
175         }
176
177         // Add length component (capped contribution to avoid favoring verbosity)
178         score += (detail.len() / 20).min(10) as i32;
179
180         let tokens = estimate_tokens(detail);
181         (idx, score, tokens, detail)
182      })
183      .collect();
184
185   // Sort by score descending
186   scored.sort_by_key(|item| std::cmp::Reverse(item.1));
187
188   // Keep details until budget exhausted
189   let mut budget_remaining = max_tokens;
190   let mut keep_indices: Vec<usize> = Vec::new();
191
192   for (idx, _score, tokens, _detail) in scored {
193      if tokens <= budget_remaining {
194         keep_indices.push(idx);
195         budget_remaining -= tokens;
196      }
197   }
198
199   keep_indices.sort_unstable(); // Preserve original order
200
201   // Filter details
202   let kept: Vec<String> = keep_indices
203      .iter()
204      .filter_map(|&idx| details.get(idx).cloned())
205      .collect();
206   *details = kept;
207}
208
209/// Convert present-tense verbs to past-tense and handle type-specific
210/// replacements. Uses the shared [`crate::validation::PAST_TENSE_MAP`] so
211/// normalization and validation stay in sync.
212pub fn normalize_summary_verb(summary: &mut String, commit_type: &str) {
213   use crate::validation::{present_to_past, split_verb_token, verb_stem};
214
215   if summary.trim().is_empty() {
216      return;
217   }
218
219   let mut parts_iter = summary.split_whitespace();
220   let first_word = match parts_iter.next() {
221      Some(word) => word.to_string(),
222      None => return,
223   };
224   let rest = parts_iter.collect::<Vec<_>>().join(" ");
225   let first_word_lower = first_word.to_lowercase();
226
227   // Check if already past tense (full token, e.g. `re-enabled`).
228   if is_past_tense_verb(&first_word_lower) {
229      // Special case: refactor type shouldn't use "refactored"
230      if commit_type == "refactor" && first_word_lower == "refactored" {
231         *summary = if rest.is_empty() {
232            "restructured".to_string()
233         } else {
234            format!("restructured {rest}")
235         };
236      }
237      return;
238   }
239
240   // Skip tokens that aren't convertible verbs:
241   //  - all-caps acronyms (API, NFC, LSP)
242   //  - numeric-led tokens (403, v1.0, 2.0.0)
243   //  - tokens whose suffix isn't a simple dash/slash separator (e.g.
244   //    `fix(tui):` is a leaked type prefix, not a verb to convert)
245   let Some((stem_raw, suffix)) = split_verb_token(&first_word) else {
246      return;
247   };
248   let stem = stem_raw.to_ascii_lowercase();
249
250   // Skip all-caps acronyms: stem would be all uppercase, and verb_stem
251   // already returns None for those.
252   if verb_stem(&first_word).is_none() {
253      return;
254   }
255
256   // Only reattach suffix for simple separators (`-`, `/`). A suffix like
257   // `(tui):` means the model leaked the conventional prefix; leave it alone
258   // (strip_type_prefix handles that elsewhere).
259   let safe_suffix = if suffix.is_empty() || suffix.starts_with('-') || suffix.starts_with('/') {
260      suffix
261   } else {
262      // Non-separator suffix (e.g. `(tui):`): don't convert, to avoid
263      // producing `fixed(tui):`.
264      return;
265   };
266
267   // Handle `re-` prefixed verbs: `split_verb_token("re-enable")` gives
268   // stem="re", suffix="-enable". Detect that, parse the verb segment from
269   // the suffix, convert it, and rebuild as `re-{past}{tail}`.
270   if stem == "re" && safe_suffix.starts_with('-') {
271      let after_dash = &safe_suffix[1..]; // skip '-'
272      let next_n = after_dash
273         .bytes()
274         .take_while(|&b| b.is_ascii_alphabetic())
275         .count();
276      if next_n > 0 {
277         let inner = after_dash[..next_n].to_ascii_lowercase();
278         let tail = &after_dash[next_n..]; // e.g. "" or "-checking"
279
280         let inner_past = present_to_past(&inner)
281            .or_else(|| inner.strip_suffix('s').and_then(|s| present_to_past(s)))
282            .or_else(|| inner.strip_suffix("es").and_then(|s| present_to_past(s)))
283            .or_else(|| {
284               inner.strip_suffix("ies").and_then(|s| present_to_past(&format!("{s}y")))
285            })
286            .map(|p| {
287               if commit_type == "refactor" && p == "refactored" {
288                  "restructured"
289               } else {
290                  p
291               }
292            });
293
294         if let Some(past) = inner_past {
295            *summary = if rest.is_empty() {
296               format!("re-{past}{tail}")
297            } else {
298               format!("re-{past}{tail} {rest}")
299            };
300         }
301      }
302      return;
303   }
304
305   // Normal case: look up the stem directly.
306   let past = present_to_past(&stem)
307      .or_else(|| {
308         // Strip trailing 's' for third-person present (adds -> add).
309         stem.strip_suffix('s').and_then(|s| present_to_past(s))
310      })
311      .or_else(|| {
312         // Strip trailing 'es' for verbs ending in s/sh/ch/x/z (fixes -> fix).
313         stem.strip_suffix("es").and_then(|s| present_to_past(s))
314      })
315      .or_else(|| {
316         // -ies -> -y (simplifies -> simplify, applies -> apply).
317         stem.strip_suffix("ies").and_then(|s| {
318            present_to_past(&format!("{s}y"))
319         })
320      })
321      .map(|p| {
322         // Special case: refactor type shouldn't use "refactored"
323         if commit_type == "refactor" && p == "refactored" {
324            "restructured"
325         } else {
326            p
327         }
328      });
329
330   if let Some(past) = past {
331      *summary = if rest.is_empty() {
332         format!("{past}{safe_suffix}")
333      } else {
334         format!("{past}{safe_suffix} {rest}")
335      };
336   }
337}
338
339/// Post-process conventional commit message to fix common issues
340pub fn post_process_commit_message(msg: &mut ConventionalCommit, config: &CommitConfig) {
341   // CommitType and Scope are already normalized to lowercase in their
342   // constructors No need to re-normalize them here
343
344   // Extract summary string for mutations, will reconstruct at end
345   let mut summary_str = normalize_unicode(msg.summary.as_str());
346
347   // Normalize body and footers
348   msg.body = msg.body.iter().map(|s| normalize_unicode(s)).collect();
349   msg.footers = msg.footers.iter().map(|s| normalize_unicode(s)).collect();
350
351   // Normalize summary formatting: single line, trimmed, enforce trailing period
352   summary_str = summary_str
353      .replace(['\r', '\n'], " ")
354      .split_whitespace()
355      .collect::<Vec<_>>()
356      .join(" ")
357      .trim()
358      .trim_end_matches('.')
359      .trim_end_matches(';')
360      .trim_end_matches(':')
361      .to_string();
362
363   // Helper: check if first token is all caps (acronym/initialism)
364   let is_first_token_all_caps = |s: &str| -> bool {
365      s.split_whitespace().next().is_some_and(|token| {
366         token
367            .chars()
368            .all(|c| !c.is_alphabetic() || c.is_uppercase())
369      })
370   };
371
372   // Ensure summary starts with lowercase (unless first token is all caps)
373   if !is_first_token_all_caps(&summary_str)
374      && let Some(first_char) = summary_str.chars().next()
375      && first_char.is_uppercase()
376   {
377      let rest = &summary_str[first_char.len_utf8()..];
378      summary_str = format!("{}{}", first_char.to_lowercase(), rest);
379   }
380
381   // Normalize verb tense (present \u{2192} past, handle type-specific
382   // replacements)
383   normalize_summary_verb(&mut summary_str, msg.commit_type.as_str());
384   summary_str = summary_str.trim().to_string();
385
386   // Ensure lowercase after normalization (unless first token is all caps)
387   if !is_first_token_all_caps(&summary_str)
388      && let Some(first_char) = summary_str.chars().next()
389      && first_char.is_uppercase()
390   {
391      let rest = &summary_str[first_char.len_utf8()..];
392      summary_str = format!("{}{}", first_char.to_lowercase(), rest);
393   }
394
395   // No truncation - validation handles length checks
396   // Remove any trailing period (conventional commits don't use periods)
397   summary_str = summary_str.trim_end_matches('.').to_string();
398
399   // Reconstruct CommitSummary (bypassing warnings since post-processing
400   // normalizes)
401   msg.summary = crate::types::CommitSummary::new_unchecked(summary_str, 128)
402      .expect("post-processed summary should be valid");
403
404   // Clean and enforce punctuation for body items
405   for item in &mut msg.body {
406      let mut cleaned = item
407         .replace(['\r', '\n'], " ")
408         .trim()
409         .trim_start_matches('\u{2022}')
410         .trim_start_matches('-')
411         .trim_start_matches('*')
412         .trim_start_matches('+')
413         .trim()
414         .to_string();
415
416      cleaned = cleaned
417         .split_whitespace()
418         .collect::<Vec<_>>()
419         .join(" ")
420         .trim()
421         .trim_end_matches('.')
422         .trim_end_matches(';')
423         .trim_end_matches(',')
424         .to_string();
425
426      if cleaned.is_empty() {
427         *item = cleaned;
428         continue;
429      }
430
431      // Capitalize first letter
432      if let Some(first_char) = cleaned.chars().next()
433         && first_char.is_lowercase()
434      {
435         let rest = &cleaned[first_char.len_utf8()..];
436         cleaned = format!("{}{}", first_char.to_uppercase(), rest);
437      }
438
439      if !cleaned.ends_with('.') {
440         cleaned.push('.');
441      }
442
443      *item = cleaned;
444   }
445
446   // Remove empty body items
447   msg.body.retain(|item| !item.trim().is_empty());
448
449   // Cap details by token budget
450   cap_details(&mut msg.body, config.max_detail_tokens);
451}
452
453/// Format `ConventionalCommit` as a single string for display and commit
454pub fn format_commit_message(msg: &ConventionalCommit) -> String {
455   // Build first line: type(scope): summary
456   let scope_part = msg
457      .scope
458      .as_ref()
459      .map(|s| format!("({s})"))
460      .unwrap_or_default();
461   let first_line = format!("{}{}: {}", msg.commit_type, scope_part, msg.summary);
462
463   // Build body with - bullets
464   let body_formatted = if msg.body.is_empty() {
465      String::new()
466   } else {
467      msg.body
468         .iter()
469         .map(|item| format!("- {item}"))
470         .collect::<Vec<_>>()
471         .join("\n")
472   };
473
474   // Build footers
475   let footers_formatted = if msg.footers.is_empty() {
476      String::new()
477   } else {
478      msg.footers.join("\n")
479   };
480
481   // Combine parts
482   let mut result = first_line;
483   if !body_formatted.is_empty() {
484      result.push_str("\n\n");
485      result.push_str(&body_formatted);
486   }
487   if !footers_formatted.is_empty() {
488      result.push_str("\n\n");
489      result.push_str(&footers_formatted);
490   }
491   result
492}
493
494#[cfg(test)]
495mod tests {
496   use super::*;
497   use crate::types::{CommitSummary, CommitType, ConventionalCommit, Scope};
498
499   // normalize_unicode tests
500   #[test]
501   fn test_normalize_unicode_smart_quotes() {
502      assert_eq!(normalize_unicode("\u{2018}smart quotes\u{2019}"), "'smart quotes'");
503      assert_eq!(normalize_unicode("\u{201C}double quotes\u{201D}"), "\"double quotes\"");
504      assert_eq!(normalize_unicode("\u{201A}low quote\u{2019}"), "'low quote'");
505      assert_eq!(normalize_unicode("\u{201E}low double\u{201D}"), "\"low double\"");
506   }
507
508   #[test]
509   fn test_normalize_unicode_dashes() {
510      assert_eq!(normalize_unicode("en\u{2013}dash"), "en--dash");
511      assert_eq!(normalize_unicode("em\u{2014}dash"), "em--dash");
512      assert_eq!(normalize_unicode("fig\u{2012}dash"), "fig-dash");
513      assert_eq!(normalize_unicode("minus\u{2212}sign"), "minus-sign");
514   }
515
516   #[test]
517   fn test_normalize_unicode_arrows() {
518      assert_eq!(normalize_unicode("arrow\u{2192}right"), "arrow->right");
519      assert_eq!(normalize_unicode("arrow\u{2190}left"), "arrow<-left");
520      assert_eq!(normalize_unicode("arrow\u{2194}both"), "arrow<->both");
521      assert_eq!(normalize_unicode("double\u{21D2}arrow"), "double=>arrow");
522      assert_eq!(normalize_unicode("up\u{2191}arrow"), "up^arrow");
523   }
524
525   #[test]
526   fn test_normalize_unicode_math() {
527      assert_eq!(normalize_unicode("a\u{00D7}b"), "axb");
528      assert_eq!(normalize_unicode("a\u{00F7}b"), "a/b");
529      assert_eq!(normalize_unicode("x\u{2264}y"), "x<=y");
530      assert_eq!(normalize_unicode("x\u{2265}y"), "x>=y");
531      assert_eq!(normalize_unicode("x\u{2260}y"), "x!=y");
532      assert_eq!(normalize_unicode("x\u{2248}y"), "x~=y");
533   }
534
535   #[test]
536   fn test_normalize_unicode_greek() {
537      assert_eq!(normalize_unicode("\u{03BB} function"), "lambda function");
538      assert_eq!(normalize_unicode("\u{03B1} beta \u{03B3}"), "alpha beta gamma");
539      assert_eq!(normalize_unicode("\u{03BC} service"), "mu service");
540      assert_eq!(normalize_unicode("\u{03A3} total"), "Sigma total");
541   }
542
543   #[test]
544   fn test_normalize_unicode_fractions() {
545      assert_eq!(normalize_unicode("\u{00BD} cup"), "1/2 cup");
546      assert_eq!(normalize_unicode("\u{00BE} done"), "3/4 done");
547      assert_eq!(normalize_unicode("\u{2153} left"), "1/3 left");
548   }
549
550   #[test]
551   fn test_normalize_unicode_superscripts() {
552      assert_eq!(normalize_unicode("x\u{00B2}"), "x^2");
553      assert_eq!(normalize_unicode("10\u{00B3}"), "10^3");
554   }
555
556   #[test]
557   fn test_normalize_unicode_multiple_replacements() {
558      let input =
559         "\u{2018}smart\u{2019}\u{2192}straight \u{201C}quotes\u{201D}\u{00D7}math\u{2264}ops";
560      let expected = "'smart'->straight \"quotes\"xmath<=ops";
561      assert_eq!(normalize_unicode(input), expected);
562   }
563
564   #[test]
565   fn test_normalize_unicode_ellipsis() {
566      assert_eq!(normalize_unicode("wait\u{2026}"), "wait...");
567      assert_eq!(normalize_unicode("more\u{22EF}dots"), "more...dots");
568   }
569
570   #[test]
571   fn test_normalize_unicode_bullets() {
572      assert_eq!(normalize_unicode("\u{2022}item"), "-item");
573      assert_eq!(normalize_unicode("\u{25E6}item"), "-item");
574   }
575
576   #[test]
577   fn test_normalize_unicode_check_marks() {
578      assert_eq!(normalize_unicode("\u{2713}done"), "vdone");
579      assert_eq!(normalize_unicode("\u{2717}failed"), "xfailed");
580   }
581
582   // normalize_summary_verb tests
583   #[test]
584   fn test_normalize_summary_verb_present_to_past() {
585      let mut s = "add new feature".to_string();
586      normalize_summary_verb(&mut s, "feat");
587      assert_eq!(s, "added new feature");
588
589      let mut s = "fix bug".to_string();
590      normalize_summary_verb(&mut s, "fix");
591      assert_eq!(s, "fixed bug");
592
593      let mut s = "update docs".to_string();
594      normalize_summary_verb(&mut s, "docs");
595      assert_eq!(s, "updated docs");
596   }
597
598   #[test]
599   fn test_normalize_summary_verb_already_past() {
600      let mut s = "added feature".to_string();
601      normalize_summary_verb(&mut s, "feat");
602      assert_eq!(s, "added feature");
603
604      let mut s = "fixed bug".to_string();
605      normalize_summary_verb(&mut s, "fix");
606      assert_eq!(s, "fixed bug");
607   }
608
609   #[test]
610   fn test_normalize_summary_verb_third_person() {
611      let mut s = "adds feature".to_string();
612      normalize_summary_verb(&mut s, "feat");
613      assert_eq!(s, "added feature");
614
615      let mut s = "fixes bug".to_string();
616      normalize_summary_verb(&mut s, "fix");
617      assert_eq!(s, "fixed bug");
618   }
619
620   #[test]
621   fn test_normalize_summary_verb_non_verb_start() {
622      let mut s = "123 files changed".to_string();
623      normalize_summary_verb(&mut s, "chore");
624      assert_eq!(s, "123 files changed");
625   }
626
627   #[test]
628   fn test_normalize_summary_verb_refactor_special_case() {
629      let mut s = "refactored code".to_string();
630      normalize_summary_verb(&mut s, "refactor");
631      assert_eq!(s, "restructured code");
632   }
633
634   #[test]
635   fn test_normalize_summary_verb_refactor_present() {
636      let mut s = "refactor code".to_string();
637      normalize_summary_verb(&mut s, "refactor");
638      assert_eq!(s, "restructured code");
639
640      let mut s = "refactor logic".to_string();
641      normalize_summary_verb(&mut s, "feat");
642      assert_eq!(s, "refactored logic");
643   }
644
645   #[test]
646   fn test_normalize_summary_verb_empty() {
647      let mut s = String::new();
648      normalize_summary_verb(&mut s, "feat");
649      assert_eq!(s, "");
650   }
651
652   #[test]
653   fn test_normalize_summary_verb_single_word() {
654      let mut s = "add".to_string();
655      normalize_summary_verb(&mut s, "feat");
656      assert_eq!(s, "added");
657   }
658
659   #[test]
660   fn test_normalize_summary_verb_harden_to_hardened() {
661      let mut s = "harden stealth scripts against detection".to_string();
662      normalize_summary_verb(&mut s, "fix");
663      assert_eq!(s, "hardened stealth scripts against detection");
664   }
665
666   #[test]
667   fn test_normalize_summary_verb_bind_to_bound() {
668      let mut s = "bind native methods to local constants".to_string();
669      normalize_summary_verb(&mut s, "fix");
670      assert_eq!(s, "bound native methods to local constants");
671   }
672
673   #[test]
674   fn test_normalize_summary_verb_third_person_ies() {
675      // -ies -> -y conversion (simplifies -> simplify -> simplified)
676      let mut s = "simplifies the config loading".to_string();
677      normalize_summary_verb(&mut s, "refactor");
678      assert_eq!(s, "simplified the config loading");
679   }
680
681   #[test]
682   fn test_normalize_summary_verb_third_person_es() {
683      // -es stripping (fixes -> fix -> fixed)
684      let mut s = "fixes race condition".to_string();
685      normalize_summary_verb(&mut s, "fix");
686      assert_eq!(s, "fixed race condition");
687   }
688
689   #[test]
690   fn test_normalize_summary_verb_suffix_reattach_dash() {
691      // Dash suffix should be reattached after conversion
692      let mut s = "isolate-subagent from main flow".to_string();
693      normalize_summary_verb(&mut s, "refactor");
694      assert_eq!(s, "isolated-subagent from main flow");
695   }
696
697   #[test]
698   fn test_normalize_summary_verb_skip_type_prefix_leak() {
699      // `fix(tui):` is a leaked conventional prefix, NOT a verb to convert.
700      // The `(tui):` suffix is not a dash/slash separator, so we skip.
701      let mut s = "fix(tui): rendering bug".to_string();
702      normalize_summary_verb(&mut s, "fix");
703      assert_eq!(s, "fix(tui): rendering bug");
704   }
705
706   #[test]
707   fn test_normalize_summary_verb_skip_acronym() {
708      // All-caps acronyms should not be converted
709      let mut s = "API response handling".to_string();
710      normalize_summary_verb(&mut s, "feat");
711      assert_eq!(s, "API response handling");
712   }
713
714   #[test]
715   fn test_normalize_summary_verb_skip_numeric() {
716      // Numeric-led tokens should not be converted
717      let mut s = "403 error handling".to_string();
718      normalize_summary_verb(&mut s, "fix");
719      assert_eq!(s, "403 error handling");
720   }
721
722   #[test]
723   fn test_normalize_summary_verb_already_past_hardened() {
724      // Already past tense should not be re-converted
725      let mut s = "hardened stealth scripts".to_string();
726      normalize_summary_verb(&mut s, "fix");
727      assert_eq!(s, "hardened stealth scripts");
728   }
729
730   #[test]
731   fn test_normalize_summary_verb_already_past_bound() {
732      let mut s = "bound native methods".to_string();
733      normalize_summary_verb(&mut s, "fix");
734      assert_eq!(s, "bound native methods");
735   }
736
737   #[test]
738   fn test_normalize_summary_verb_preserves_existing_third_person() {
739      // The old test had "adds" and "fixes" - ensure they still work
740      let mut s = "adds feature".to_string();
741      normalize_summary_verb(&mut s, "feat");
742      assert_eq!(s, "added feature");
743
744      let mut s = "fixes bug".to_string();
745      normalize_summary_verb(&mut s, "fix");
746      assert_eq!(s, "fixed bug");
747
748      let mut s = "updates docs".to_string();
749      normalize_summary_verb(&mut s, "docs");
750      assert_eq!(s, "updated docs");
751   }
752
753   #[test]
754   fn test_normalize_summary_verb_re_prefix_enable() {
755      let mut s = "re-enable formatting checks".to_string();
756      normalize_summary_verb(&mut s, "fix");
757      assert_eq!(s, "re-enabled formatting checks");
758   }
759
760   #[test]
761   fn test_normalize_summary_verb_re_prefix_run() {
762      let mut s = "re-run the test suite".to_string();
763      normalize_summary_verb(&mut s, "fix");
764      assert_eq!(s, "re-ran the test suite");
765   }
766
767   #[test]
768   fn test_normalize_summary_verb_re_prefix_with_tail() {
769      // re-format-checking -> re-formatted-checking
770      let mut s = "re-format-checking pipeline".to_string();
771      normalize_summary_verb(&mut s, "fix");
772      assert_eq!(s, "re-formatted-checking pipeline");
773   }
774
775   #[test]
776   fn test_normalize_summary_verb_re_prefix_already_past() {
777      // re-enabled is already past tense, should not be re-converted
778      let mut s = "re-enabled linting".to_string();
779      normalize_summary_verb(&mut s, "fix");
780      assert_eq!(s, "re-enabled linting");
781   }
782
783   // cap_details tests (budget-based)
784   #[test]
785   fn test_cap_details_under_budget() {
786      let mut details = vec!["first".to_string(), "second".to_string(), "third".to_string()];
787      let tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
788      cap_details(&mut details, tokens + 100);
789      assert_eq!(details.len(), 3);
790   }
791
792   #[test]
793   fn test_cap_details_at_budget() {
794      let mut details = vec![
795         "one".to_string(),
796         "two".to_string(),
797         "three".to_string(),
798         "four".to_string(),
799         "five".to_string(),
800         "six".to_string(),
801      ];
802      let tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
803      cap_details(&mut details, tokens);
804      assert_eq!(details.len(), 6);
805   }
806
807   #[test]
808   fn test_cap_details_security_priority() {
809      let mut details = vec![
810         "normal change".to_string(),
811         "security vulnerability fixed".to_string(),
812         "another change".to_string(),
813         "third change".to_string(),
814         "fourth change".to_string(),
815         "fifth change".to_string(),
816         "sixth change".to_string(),
817      ];
818      // Budget for ~4 typical items (15 chars each = ~4 tokens, 4*4 = 16 tokens)
819      cap_details(&mut details, 60);
820      assert!(details.iter().any(|d| d.contains("security")));
821   }
822
823   #[test]
824   fn test_cap_details_performance_priority() {
825      let mut details = vec![
826         "normal change".to_string(),
827         "performance optimization added".to_string(),
828         "another change".to_string(),
829         "third change".to_string(),
830         "fourth change".to_string(),
831         "fifth change".to_string(),
832      ];
833      // Budget for ~3 typical items
834      cap_details(&mut details, 40);
835      assert!(details.iter().any(|d| d.contains("performance")));
836   }
837
838   #[test]
839   fn test_cap_details_api_priority() {
840      let mut details = vec![
841         "normal change".to_string(),
842         "API interface updated".to_string(),
843         "internal change".to_string(),
844         "another internal change".to_string(),
845         "yet another change".to_string(),
846      ];
847      // Budget for ~3 items
848      cap_details(&mut details, 50);
849      assert!(details.iter().any(|d| d.contains("API")));
850   }
851
/// Caps a five-item list at a budget of 50 and checks that the "security"
/// entry still appears before the "performance" entry, as it does in the
/// input.
#[test]
fn test_cap_details_preserves_order() {
   let mut details = vec![
      "first".to_string(),
      "critical security fix".to_string(),
      "third".to_string(),
      "performance improvement".to_string(),
      "fifth".to_string(),
   ];
   // Budget for ~3 items
   cap_details(&mut details, 50);

   // Both entries must survive for the ordering check to mean anything;
   // `expect` names the missing entry instead of failing with a bare
   // `unwrap` on `None`.
   let security_idx = details
      .iter()
      .position(|d| d.contains("security"))
      .expect("security detail should survive capping");
   let perf_idx = details
      .iter()
      .position(|d| d.contains("performance"))
      .expect("performance detail should survive capping");

   // Should preserve relative order of kept items
   assert!(
      security_idx < perf_idx,
      "security detail (index {}) should precede performance detail (index {})",
      security_idx,
      perf_idx
   );
}
868
/// Calls `cap_details` on an empty list with a budget of 100 and checks that
/// the list is still empty afterwards.
#[test]
fn test_cap_details_empty_list() {
   let mut details = Vec::<String>::new();
   cap_details(&mut details, 100);
   assert!(details.is_empty());
}
875
/// Caps a five-item list at a budget of 50 and checks that the entry
/// mentioning "breaking" is still present afterwards.
#[test]
fn test_cap_details_breaking_priority() {
   let mut details: Vec<String> = Vec::new();
   details.extend(
      [
         "normal change",
         "breaking change introduced",
         "another change",
         "third change",
         "fourth change",
      ]
      .map(String::from),
   );

   // Budget for ~3 items
   cap_details(&mut details, 50);

   let kept_breaking = details.iter().any(|line| line.contains("breaking"));
   assert!(kept_breaking);
}
889
/// Mixes six short entries with two long ones, caps at a budget of 30, and
/// checks that the "security" and "breaking" entries remain while the first
/// long entry is gone.
///
/// The token and score figures in the comments below are the original
/// author's estimates of how `cap_details` weighs each entry; they are not
/// verified by this test.
#[test]
fn test_cap_details_budget_prefers_short_high_priority() {
   // 6 short high-priority items should fit, but 2 long low-priority shouldn't
   let mut details: Vec<String> = [
      "security fix",     // 12 chars, ~3 tokens, score 100
      "bug fix",          // 7 chars, ~2 tokens, score 70
      "API change",       // 10 chars, ~3 tokens, score 50
      "performance gain", // 16 chars, ~4 tokens, score 80
      "breaking change",  // 15 chars, ~4 tokens, score 90
      "user feature",     // 12 chars, ~3 tokens, score 40
      // 71 chars, roughly 18-19 tokens, score 0
      "This is a very long internal refactoring detail that adds no user value",
      // 59 chars, roughly 15-16 tokens, score 0
      "Another extremely long low priority change description here",
   ]
   .iter()
   .copied()
   .map(str::to_owned)
   .collect();

   // Budget: 30 tokens (enough for all 6 short items, not enough for long ones)
   cap_details(&mut details, 30);

   let mentions = |needle: &str| details.iter().any(|entry| entry.contains(needle));

   // Should keep short high-priority items
   assert!(mentions("security"));
   assert!(mentions("breaking"));
   // Should drop long low-priority items
   assert!(!mentions("very long internal"));
}
911
/// Applies the same budget of 50 to a list of six short entries and a list of
/// three long entries, then checks how many entries each list retains.
#[test]
fn test_cap_details_budget_allows_variable_count() {
   // With same budget, should fit more short items or fewer long items

   // "fix A" through "fix F"
   let mut short: Vec<String> = (b'A'..=b'F')
      .map(|letter| format!("fix {}", letter as char))
      .collect();

   let mut long: Vec<String> = Vec::from(
      [
         "Fixed a critical security vulnerability in authentication",
         "Implemented comprehensive performance optimization",
         "Added extensive API documentation and examples",
      ]
      .map(String::from),
   );

   // Should fit all 6 short items (~2 tokens each)
   cap_details(&mut short, 50);
   // Should fit only 2-3 long items (~13-15 tokens each)
   cap_details(&mut long, 50);

   // Most short items fit
   assert!(short.len() >= 5);
   // Fewer long items fit.
   // NOTE(review): `long` starts with only 3 entries, so this bound cannot
   // fail unless `cap_details` adds entries; consider a tighter budget or
   // more long entries if the intent is to observe truncation.
   assert!(long.len() <= 3);
}
938
939   // format_commit_message tests
/// Formats a commit that has only a type and a summary (no scope, body, or
/// footers) and expects the single line `feat: added new feature`.
#[test]
fn test_format_commit_message_type_summary_only() {
   let commit_type = CommitType::new("feat").unwrap();
   let summary = CommitSummary::new_unchecked("added new feature", 128).unwrap();
   let commit = ConventionalCommit {
      commit_type,
      scope: None,
      summary,
      body: Vec::new(),
      footers: Vec::new(),
   };

   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, "feat: added new feature");
}
951
/// Formats a commit with type `fix` and scope `api` and expects the scope to
/// appear in parentheses after the type: `fix(api): fixed bug`.
#[test]
fn test_format_commit_message_with_scope() {
   let commit_type = CommitType::new("fix").unwrap();
   let scope = Scope::new("api").unwrap();
   let summary = CommitSummary::new_unchecked("fixed bug", 128).unwrap();
   let commit = ConventionalCommit {
      commit_type,
      scope: Some(scope),
      summary,
      body: Vec::new(),
      footers: Vec::new(),
   };

   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, "fix(api): fixed bug");
}
963
/// Formats a commit with two body entries and expects them rendered as
/// `- `-prefixed lines, separated from the header by a blank line.
#[test]
fn test_format_commit_message_with_body() {
   let commit_type = CommitType::new("feat").unwrap();
   let summary = CommitSummary::new_unchecked("added feature", 128).unwrap();
   let body: Vec<String> = Vec::from(["First detail.", "Second detail."].map(String::from));
   let commit = ConventionalCommit {
      commit_type,
      scope: None,
      summary,
      body,
      footers: Vec::new(),
   };

   // One literal per output line, so the expected layout is visible at a glance.
   let expected = concat!(
      "feat: added feature\n",
      "\n",
      "- First detail.\n",
      "- Second detail.",
   );
   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, expected);
}
976
/// Formats a commit with two footers and no body and expects the footers on
/// their own lines, separated from the header by a blank line.
#[test]
fn test_format_commit_message_with_footers() {
   let commit_type = CommitType::new("fix").unwrap();
   let summary = CommitSummary::new_unchecked("fixed bug", 128).unwrap();
   let footers: Vec<String> = ["Closes: #123", "Fixes: #456"]
      .iter()
      .copied()
      .map(str::to_owned)
      .collect();
   let commit = ConventionalCommit {
      commit_type,
      scope: None,
      summary,
      body: Vec::new(),
      footers,
   };

   // One literal per output line, so the expected layout is visible at a glance.
   let expected = concat!(
      "fix: fixed bug\n",
      "\n",
      "Closes: #123\n",
      "Fixes: #456",
   );
   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, expected);
}
989
/// Formats a commit that has a scope, two body entries, and one footer, and
/// expects header, bulleted body, and footer separated by blank lines.
#[test]
fn test_format_commit_message_full() {
   let commit_type = CommitType::new("feat").unwrap();
   let scope = Scope::new("auth").unwrap();
   let summary = CommitSummary::new_unchecked("added oauth support", 128).unwrap();
   let body: Vec<String> =
      Vec::from(["Implemented OAuth2 flow.", "Added token refresh."].map(String::from));
   let footers: Vec<String> = Vec::from([String::from("Closes: #789")]);
   let commit = ConventionalCommit {
      commit_type,
      scope: Some(scope),
      summary,
      body,
      footers,
   };

   // One literal per output line, so the expected layout is visible at a glance.
   let expected = concat!(
      "feat(auth): added oauth support\n",
      "\n",
      "- Implemented OAuth2 flow.\n",
      "- Added token refresh.\n",
      "\n",
      "Closes: #789",
   );
   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, expected);
}
1006
/// Formats a commit whose scope contains a slash (`api/client`) and expects
/// the scope to be rendered unchanged inside the parentheses.
#[test]
fn test_format_commit_message_nested_scope() {
   let commit_type = CommitType::new("refactor").unwrap();
   let scope = Scope::new("api/client").unwrap();
   let summary = CommitSummary::new_unchecked("restructured code", 128).unwrap();
   let commit = ConventionalCommit {
      commit_type,
      scope: Some(scope),
      summary,
      body: Vec::new(),
      footers: Vec::new(),
   };

   let rendered = format_commit_message(&commit);
   assert_eq!(rendered, "refactor(api/client): restructured code");
}
1018}
llm_git/normalization.rs

llm_git/
normalization.rs