llm_git/
normalization.rs

//! Normalization utilities for commit messages
2use unicode_normalization::UnicodeNormalization;
3
4use crate::{config::CommitConfig, types::ConventionalCommit, validation::is_past_tense_verb};
5
6/// Normalize Unicode characters to ASCII (remove AI-style formatting)
7/// Normalize Unicode characters to ASCII (remove AI-style formatting)
8pub fn normalize_unicode(text: &str) -> String {
9   // Pre-NFKD replacements for chars that decompose badly
10   // (≠ → = + combining, ½ → 1⁄2, ² → 2)
11   let pre_normalized = text
12      // Math symbols that decompose badly
13      .replace('≠', "!=") // not equal to (decomposes to = + \u{338})
14      // Fractions (NFKD decomposes ½ to 1⁄2 with fraction slash, not regular /)
15      .replace('½', "1/2")
16      .replace('¼', "1/4")
17      .replace('¾', "3/4")
18      .replace('⅓', "1/3")
19      .replace('⅔', "2/3")
20      .replace('⅕', "1/5")
21      .replace('⅖', "2/5")
22      .replace('⅗', "3/5")
23      .replace('⅘', "4/5")
24      .replace('⅙', "1/6")
25      .replace('⅚', "5/6")
26      .replace('⅛', "1/8")
27      .replace('⅜', "3/8")
28      .replace('⅝', "5/8")
29      .replace('⅞', "7/8")
30      // Superscripts (NFKD decomposes ² to just "2", losing the superscript meaning)
31      .replace('⁰', "^0")
32      .replace('¹', "^1")
33      .replace('²', "^2")
34      .replace('³', "^3")
35      .replace('⁴', "^4")
36      .replace('⁵', "^5")
37      .replace('⁶', "^6")
38      .replace('⁷', "^7")
39      .replace('⁸', "^8")
40      .replace('⁹', "^9")
41      // Subscripts
42      .replace('₀', "_0")
43      .replace('₁', "_1")
44      .replace('₂', "_2")
45      .replace('₃', "_3")
46      .replace('₄', "_4")
47      .replace('₅', "_5")
48      .replace('₆', "_6")
49      .replace('₇', "_7")
50      .replace('₈', "_8")
51      .replace('₉', "_9");
52
53   // Apply NFKD normalization for canonical decomposition
54   let normalized: String = pre_normalized.nfkd().collect();
55
56   normalized
57      // Smart quotes to straight quotes
58      .replace(['\u{2018}', '\u{2019}'], "'") // ' right single quote / apostrophe
59      .replace(['\u{201C}', '\u{201D}'], "\"") // " right double quote
60      .replace('\u{201A}', "'") // ‚ single low-9 quote
61      .replace(['\u{201E}', '\u{00AB}', '\u{00BB}'], "\"") // » right-pointing double angle quote
62      .replace(['\u{2039}', '\u{203A}'], "'") // › single right-pointing angle quote
63      // Dashes and hyphens
64      .replace(['\u{2010}', '\u{2011}', '\u{2012}'], "-") // ‒ figure dash
65      .replace(['\u{2013}', '\u{2014}', '\u{2015}'], "--") // ― horizontal bar
66      .replace('\u{2212}', "-") // − minus sign
67      // Arrows
68      .replace('\u{2192}', "->") // rightwards arrow
69      .replace('←', "<-") // leftwards arrow
70      .replace('↔', "<->") // left right arrow
71      .replace('⇒', "=>") // rightwards double arrow
72      .replace('⇐', "<=") // leftwards double arrow
73      .replace('⇔', "<=>") // left right double arrow
74      .replace('↑', "^") // upwards arrow
75      .replace('↓', "v") // downwards arrow
76      // Math symbols
77      .replace('\u{2264}', "<=") // less than or equal to
78      .replace('≥', ">=") // greater than or equal to
79      .replace('≈', "~=") // approximately equal to
80      .replace('≡', "==") // identical to
81      .replace('\u{00D7}', "x") // multiplication sign
82      .replace('÷', "/") // division sign
83      // Ellipsis
84      .replace(['\u{2026}', '⋯', '⋮'], "...") // vertical ellipsis
85      // Bullet points (convert to hyphens for consistency)
86      .replace(['•', '◦', '▪', '▫', '◆', '◇'], "-") // white diamond
87      // Check marks
88      .replace(['✓', '✔'], "v") // heavy check mark
89      .replace(['✗', '✘'], "x") // heavy ballot x
90      // Greek letters (common in programming)
91      .replace('λ', "lambda")
92      .replace('α', "alpha")
93      .replace('β', "beta")
94      .replace('γ', "gamma")
95      .replace('δ', "delta")
96      .replace('ε', "epsilon")
97      .replace('θ', "theta")
98      .replace('μ', "mu")
99      .replace('π', "pi")
100      .replace('σ', "sigma")
101      .replace('Σ', "Sigma")
102      .replace('Δ', "Delta")
103      .replace('Π', "Pi")
104      // Special spaces to regular space
105      .replace(
106         [
107            '\u{00A0}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
108            '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200A}', '\u{202F}', '\u{205F}',
109            '\u{3000}',
110         ],
111         " ",
112      ) // ideographic space
113      // Zero-width characters (remove)
114      .replace(['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'], "") // zero-width no-break space (BOM)
115}
116
/// Estimate token count for text (rough approximation: 1 token ≈ 4 bytes)
///
/// The length is measured in UTF-8 bytes (`str::len`), not characters, and the
/// quotient is rounded up, so any non-empty text costs at least one token.
const fn estimate_tokens(text: &str) -> usize {
   let byte_len = text.len();
   let full_groups = byte_len / 4;
   // A partial trailing group still counts as one token.
   if byte_len % 4 == 0 {
      full_groups
   } else {
      full_groups + 1
   }
}
121
122/// Cap detail points by token budget instead of hard count
123/// Keeps high-priority details until budget exhausted
124pub fn cap_details(details: &mut Vec<String>, max_tokens: usize) {
125   if details.is_empty() {
126      return;
127   }
128
129   // Calculate total tokens
130   let total_tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
131
132   if total_tokens <= max_tokens {
133      return; // Under budget, keep all
134   }
135
136   // Score by priority keywords and length
137   let mut scored: Vec<(usize, i32, usize, &String)> = details
138      .iter()
139      .enumerate()
140      .map(|(idx, detail)| {
141         let lower = detail.to_lowercase();
142         let mut score = 0;
143
144         // High priority keywords (security, crashes, critical bugs)
145         if lower.contains("security")
146            || lower.contains("vulnerability")
147            || lower.contains("exploit")
148            || lower.contains("critical")
149            || (lower.contains("fix") && lower.contains("crash"))
150         {
151            score += 100;
152         }
153         if lower.contains("breaking") || lower.contains("incompatible") {
154            score += 90;
155         }
156         if lower.contains("performance")
157            || lower.contains("faster")
158            || lower.contains("optimization")
159         {
160            score += 80;
161         }
162         if lower.contains("fix") || lower.contains("bug") {
163            score += 70;
164         }
165
166         // Medium priority keywords
167         if lower.contains("api") || lower.contains("interface") || lower.contains("public") {
168            score += 50;
169         }
170         if lower.contains("user") || lower.contains("client") {
171            score += 40;
172         }
173         if lower.contains("deprecated") || lower.contains("removed") {
174            score += 35;
175         }
176
177         // Add length component (capped contribution to avoid favoring verbosity)
178         score += (detail.len() / 20).min(10) as i32;
179
180         let tokens = estimate_tokens(detail);
181         (idx, score, tokens, detail)
182      })
183      .collect();
184
185   // Sort by score descending
186   scored.sort_by_key(|item| std::cmp::Reverse(item.1));
187
188   // Keep details until budget exhausted
189   let mut budget_remaining = max_tokens;
190   let mut keep_indices: Vec<usize> = Vec::new();
191
192   for (idx, _score, tokens, _detail) in scored {
193      if tokens <= budget_remaining {
194         keep_indices.push(idx);
195         budget_remaining -= tokens;
196      }
197   }
198
199   keep_indices.sort_unstable(); // Preserve original order
200
201   // Filter details
202   let kept: Vec<String> = keep_indices
203      .iter()
204      .filter_map(|&idx| details.get(idx).cloned())
205      .collect();
206   *details = kept;
207}
208
209/// Convert present-tense verbs to past-tense and handle type-specific
210/// replacements. Uses the shared [`crate::validation::PAST_TENSE_MAP`] so
211/// normalization and validation stay in sync.
212pub fn normalize_summary_verb(summary: &mut String, commit_type: &str) {
213   use crate::validation::{present_to_past, split_verb_token, verb_stem};
214
215   if summary.trim().is_empty() {
216      return;
217   }
218
219   let mut parts_iter = summary.split_whitespace();
220   let first_word = match parts_iter.next() {
221      Some(word) => word.to_string(),
222      None => return,
223   };
224   let rest = parts_iter.collect::<Vec<_>>().join(" ");
225   let first_word_lower = first_word.to_lowercase();
226
227   // Check if already past tense (full token, e.g. `re-enabled`).
228   if is_past_tense_verb(&first_word_lower) {
229      // Special case: refactor type shouldn't use "refactored"
230      if commit_type == "refactor" && first_word_lower == "refactored" {
231         *summary = if rest.is_empty() {
232            "restructured".to_string()
233         } else {
234            format!("restructured {rest}")
235         };
236      }
237      return;
238   }
239
240   // Skip tokens that aren't convertible verbs:
241   //  - all-caps acronyms (API, NFC, LSP)
242   //  - numeric-led tokens (403, v1.0, 2.0.0)
243   //  - tokens whose suffix isn't a simple dash/slash separator (e.g. `fix(tui):`
244   //    is a leaked type prefix, not a verb to convert)
245   let Some((stem_raw, suffix)) = split_verb_token(&first_word) else {
246      return;
247   };
248   let stem = stem_raw.to_ascii_lowercase();
249
250   // Skip all-caps acronyms: stem would be all uppercase, and verb_stem
251   // already returns None for those.
252   if verb_stem(&first_word).is_none() {
253      return;
254   }
255
256   // Only reattach suffix for simple separators (`-`, `/`). A suffix like
257   // `(tui):` means the model leaked the conventional prefix; leave it alone
258   // (strip_type_prefix handles that elsewhere).
259   let safe_suffix = if suffix.is_empty() || suffix.starts_with('-') || suffix.starts_with('/') {
260      suffix
261   } else {
262      // Non-separator suffix (e.g. `(tui):`): don't convert, to avoid
263      // producing `fixed(tui):`.
264      return;
265   };
266
267   // Handle `re-` prefixed verbs: `split_verb_token("re-enable")` gives
268   // stem="re", suffix="-enable". Detect that, parse the verb segment from
269   // the suffix, convert it, and rebuild as `re-{past}{tail}`.
270   if stem == "re" && safe_suffix.starts_with('-') {
271      let after_dash = &safe_suffix[1..]; // skip '-'
272      let next_n = after_dash
273         .bytes()
274         .take_while(|&b| b.is_ascii_alphabetic())
275         .count();
276      if next_n > 0 {
277         let inner = after_dash[..next_n].to_ascii_lowercase();
278         let tail = &after_dash[next_n..]; // e.g. "" or "-checking"
279
280         let inner_past = present_to_past(&inner)
281            .or_else(|| inner.strip_suffix('s').and_then(|s| present_to_past(s)))
282            .or_else(|| inner.strip_suffix("es").and_then(|s| present_to_past(s)))
283            .or_else(|| {
284               inner
285                  .strip_suffix("ies")
286                  .and_then(|s| present_to_past(&format!("{s}y")))
287            })
288            .map(|p| {
289               if commit_type == "refactor" && p == "refactored" {
290                  "restructured"
291               } else {
292                  p
293               }
294            });
295
296         if let Some(past) = inner_past {
297            *summary = if rest.is_empty() {
298               format!("re-{past}{tail}")
299            } else {
300               format!("re-{past}{tail} {rest}")
301            };
302         }
303      }
304      return;
305   }
306
307   // Normal case: look up the stem directly.
308   let past = present_to_past(&stem)
309      .or_else(|| {
310         // Strip trailing 's' for third-person present (adds -> add).
311         stem.strip_suffix('s').and_then(|s| present_to_past(s))
312      })
313      .or_else(|| {
314         // Strip trailing 'es' for verbs ending in s/sh/ch/x/z (fixes -> fix).
315         stem.strip_suffix("es").and_then(|s| present_to_past(s))
316      })
317      .or_else(|| {
318         // -ies -> -y (simplifies -> simplify, applies -> apply).
319         stem
320            .strip_suffix("ies")
321            .and_then(|s| present_to_past(&format!("{s}y")))
322      })
323      .map(|p| {
324         // Special case: refactor type shouldn't use "refactored"
325         if commit_type == "refactor" && p == "refactored" {
326            "restructured"
327         } else {
328            p
329         }
330      });
331
332   if let Some(past) = past {
333      *summary = if rest.is_empty() {
334         format!("{past}{safe_suffix}")
335      } else {
336         format!("{past}{safe_suffix} {rest}")
337      };
338   }
339}
340
341/// Post-process conventional commit message to fix common issues
342pub fn post_process_commit_message(msg: &mut ConventionalCommit, config: &CommitConfig) {
343   // CommitType and Scope are already normalized to lowercase in their
344   // constructors No need to re-normalize them here
345
346   // Extract summary string for mutations, will reconstruct at end
347   let mut summary_str = normalize_unicode(msg.summary.as_str());
348
349   // Normalize body and footers
350   msg.body = msg.body.iter().map(|s| normalize_unicode(s)).collect();
351   msg.footers = msg.footers.iter().map(|s| normalize_unicode(s)).collect();
352
353   // Normalize summary formatting: single line, trimmed, enforce trailing period
354   summary_str = summary_str
355      .replace(['\r', '\n'], " ")
356      .split_whitespace()
357      .collect::<Vec<_>>()
358      .join(" ")
359      .trim()
360      .trim_end_matches('.')
361      .trim_end_matches(';')
362      .trim_end_matches(':')
363      .to_string();
364
365   // Helper: check if first token is all caps (acronym/initialism)
366   let is_first_token_all_caps = |s: &str| -> bool {
367      s.split_whitespace().next().is_some_and(|token| {
368         token
369            .chars()
370            .all(|c| !c.is_alphabetic() || c.is_uppercase())
371      })
372   };
373
374   // Ensure summary starts with lowercase (unless first token is all caps)
375   if !is_first_token_all_caps(&summary_str)
376      && let Some(first_char) = summary_str.chars().next()
377      && first_char.is_uppercase()
378   {
379      let rest = &summary_str[first_char.len_utf8()..];
380      summary_str = format!("{}{}", first_char.to_lowercase(), rest);
381   }
382
383   // Normalize verb tense (present \u{2192} past, handle type-specific
384   // replacements)
385   normalize_summary_verb(&mut summary_str, msg.commit_type.as_str());
386   summary_str = summary_str.trim().to_string();
387
388   // Ensure lowercase after normalization (unless first token is all caps)
389   if !is_first_token_all_caps(&summary_str)
390      && let Some(first_char) = summary_str.chars().next()
391      && first_char.is_uppercase()
392   {
393      let rest = &summary_str[first_char.len_utf8()..];
394      summary_str = format!("{}{}", first_char.to_lowercase(), rest);
395   }
396
397   // No truncation - validation handles length checks
398   // Remove any trailing period (conventional commits don't use periods)
399   summary_str = summary_str.trim_end_matches('.').to_string();
400
401   // Reconstruct CommitSummary (bypassing warnings since post-processing
402   // normalizes)
403   msg.summary = crate::types::CommitSummary::new_unchecked(summary_str, 128)
404      .expect("post-processed summary should be valid");
405
406   // Clean and enforce punctuation for body items
407   for item in &mut msg.body {
408      let mut cleaned = item
409         .replace(['\r', '\n'], " ")
410         .trim()
411         .trim_start_matches('\u{2022}')
412         .trim_start_matches('-')
413         .trim_start_matches('*')
414         .trim_start_matches('+')
415         .trim()
416         .to_string();
417
418      cleaned = cleaned
419         .split_whitespace()
420         .collect::<Vec<_>>()
421         .join(" ")
422         .trim()
423         .trim_end_matches('.')
424         .trim_end_matches(';')
425         .trim_end_matches(',')
426         .to_string();
427
428      if cleaned.is_empty() {
429         *item = cleaned;
430         continue;
431      }
432
433      // Capitalize first letter
434      if let Some(first_char) = cleaned.chars().next()
435         && first_char.is_lowercase()
436      {
437         let rest = &cleaned[first_char.len_utf8()..];
438         cleaned = format!("{}{}", first_char.to_uppercase(), rest);
439      }
440
441      if !cleaned.ends_with('.') {
442         cleaned.push('.');
443      }
444
445      *item = cleaned;
446   }
447
448   // Remove empty body items
449   msg.body.retain(|item| !item.trim().is_empty());
450
451   // Cap details by token budget
452   cap_details(&mut msg.body, config.max_detail_tokens);
453}
454
455/// Format `ConventionalCommit` as a single string for display and commit
456pub fn format_commit_message(msg: &ConventionalCommit) -> String {
457   // Build first line: type(scope): summary
458   let scope_part = msg
459      .scope
460      .as_ref()
461      .map(|s| format!("({s})"))
462      .unwrap_or_default();
463   let first_line = format!("{}{}: {}", msg.commit_type, scope_part, msg.summary);
464
465   // Build body with - bullets
466   let body_formatted = if msg.body.is_empty() {
467      String::new()
468   } else {
469      msg.body
470         .iter()
471         .map(|item| format!("- {item}"))
472         .collect::<Vec<_>>()
473         .join("\n")
474   };
475
476   // Build footers
477   let footers_formatted = if msg.footers.is_empty() {
478      String::new()
479   } else {
480      msg.footers.join("\n")
481   };
482
483   // Combine parts
484   let mut result = first_line;
485   if !body_formatted.is_empty() {
486      result.push_str("\n\n");
487      result.push_str(&body_formatted);
488   }
489   if !footers_formatted.is_empty() {
490      result.push_str("\n\n");
491      result.push_str(&footers_formatted);
492   }
493   result
494}
495
496#[cfg(test)]
497mod tests {
498   use super::*;
499   use crate::types::{CommitSummary, CommitType, ConventionalCommit, Scope};
500
501   // normalize_unicode tests
502   #[test]
503   fn test_normalize_unicode_smart_quotes() {
504      assert_eq!(normalize_unicode("\u{2018}smart quotes\u{2019}"), "'smart quotes'");
505      assert_eq!(normalize_unicode("\u{201C}double quotes\u{201D}"), "\"double quotes\"");
506      assert_eq!(normalize_unicode("\u{201A}low quote\u{2019}"), "'low quote'");
507      assert_eq!(normalize_unicode("\u{201E}low double\u{201D}"), "\"low double\"");
508   }
509
510   #[test]
511   fn test_normalize_unicode_dashes() {
512      assert_eq!(normalize_unicode("en\u{2013}dash"), "en--dash");
513      assert_eq!(normalize_unicode("em\u{2014}dash"), "em--dash");
514      assert_eq!(normalize_unicode("fig\u{2012}dash"), "fig-dash");
515      assert_eq!(normalize_unicode("minus\u{2212}sign"), "minus-sign");
516   }
517
518   #[test]
519   fn test_normalize_unicode_arrows() {
520      assert_eq!(normalize_unicode("arrow\u{2192}right"), "arrow->right");
521      assert_eq!(normalize_unicode("arrow\u{2190}left"), "arrow<-left");
522      assert_eq!(normalize_unicode("arrow\u{2194}both"), "arrow<->both");
523      assert_eq!(normalize_unicode("double\u{21D2}arrow"), "double=>arrow");
524      assert_eq!(normalize_unicode("up\u{2191}arrow"), "up^arrow");
525   }
526
527   #[test]
528   fn test_normalize_unicode_math() {
529      assert_eq!(normalize_unicode("a\u{00D7}b"), "axb");
530      assert_eq!(normalize_unicode("a\u{00F7}b"), "a/b");
531      assert_eq!(normalize_unicode("x\u{2264}y"), "x<=y");
532      assert_eq!(normalize_unicode("x\u{2265}y"), "x>=y");
533      assert_eq!(normalize_unicode("x\u{2260}y"), "x!=y");
534      assert_eq!(normalize_unicode("x\u{2248}y"), "x~=y");
535   }
536
537   #[test]
538   fn test_normalize_unicode_greek() {
539      assert_eq!(normalize_unicode("\u{03BB} function"), "lambda function");
540      assert_eq!(normalize_unicode("\u{03B1} beta \u{03B3}"), "alpha beta gamma");
541      assert_eq!(normalize_unicode("\u{03BC} service"), "mu service");
542      assert_eq!(normalize_unicode("\u{03A3} total"), "Sigma total");
543   }
544
545   #[test]
546   fn test_normalize_unicode_fractions() {
547      assert_eq!(normalize_unicode("\u{00BD} cup"), "1/2 cup");
548      assert_eq!(normalize_unicode("\u{00BE} done"), "3/4 done");
549      assert_eq!(normalize_unicode("\u{2153} left"), "1/3 left");
550   }
551
552   #[test]
553   fn test_normalize_unicode_superscripts() {
554      assert_eq!(normalize_unicode("x\u{00B2}"), "x^2");
555      assert_eq!(normalize_unicode("10\u{00B3}"), "10^3");
556   }
557
558   #[test]
559   fn test_normalize_unicode_multiple_replacements() {
560      let input =
561         "\u{2018}smart\u{2019}\u{2192}straight \u{201C}quotes\u{201D}\u{00D7}math\u{2264}ops";
562      let expected = "'smart'->straight \"quotes\"xmath<=ops";
563      assert_eq!(normalize_unicode(input), expected);
564   }
565
566   #[test]
567   fn test_normalize_unicode_ellipsis() {
568      assert_eq!(normalize_unicode("wait\u{2026}"), "wait...");
569      assert_eq!(normalize_unicode("more\u{22EF}dots"), "more...dots");
570   }
571
572   #[test]
573   fn test_normalize_unicode_bullets() {
574      assert_eq!(normalize_unicode("\u{2022}item"), "-item");
575      assert_eq!(normalize_unicode("\u{25E6}item"), "-item");
576   }
577
578   #[test]
579   fn test_normalize_unicode_check_marks() {
580      assert_eq!(normalize_unicode("\u{2713}done"), "vdone");
581      assert_eq!(normalize_unicode("\u{2717}failed"), "xfailed");
582   }
583
584   // normalize_summary_verb tests
585   #[test]
586   fn test_normalize_summary_verb_present_to_past() {
587      let mut s = "add new feature".to_string();
588      normalize_summary_verb(&mut s, "feat");
589      assert_eq!(s, "added new feature");
590
591      let mut s = "fix bug".to_string();
592      normalize_summary_verb(&mut s, "fix");
593      assert_eq!(s, "fixed bug");
594
595      let mut s = "update docs".to_string();
596      normalize_summary_verb(&mut s, "docs");
597      assert_eq!(s, "updated docs");
598   }
599
600   #[test]
601   fn test_normalize_summary_verb_already_past() {
602      let mut s = "added feature".to_string();
603      normalize_summary_verb(&mut s, "feat");
604      assert_eq!(s, "added feature");
605
606      let mut s = "fixed bug".to_string();
607      normalize_summary_verb(&mut s, "fix");
608      assert_eq!(s, "fixed bug");
609   }
610
611   #[test]
612   fn test_normalize_summary_verb_third_person() {
613      let mut s = "adds feature".to_string();
614      normalize_summary_verb(&mut s, "feat");
615      assert_eq!(s, "added feature");
616
617      let mut s = "fixes bug".to_string();
618      normalize_summary_verb(&mut s, "fix");
619      assert_eq!(s, "fixed bug");
620   }
621
622   #[test]
623   fn test_normalize_summary_verb_non_verb_start() {
624      let mut s = "123 files changed".to_string();
625      normalize_summary_verb(&mut s, "chore");
626      assert_eq!(s, "123 files changed");
627   }
628
629   #[test]
630   fn test_normalize_summary_verb_refactor_special_case() {
631      let mut s = "refactored code".to_string();
632      normalize_summary_verb(&mut s, "refactor");
633      assert_eq!(s, "restructured code");
634   }
635
636   #[test]
637   fn test_normalize_summary_verb_refactor_present() {
638      let mut s = "refactor code".to_string();
639      normalize_summary_verb(&mut s, "refactor");
640      assert_eq!(s, "restructured code");
641
642      let mut s = "refactor logic".to_string();
643      normalize_summary_verb(&mut s, "feat");
644      assert_eq!(s, "refactored logic");
645   }
646
647   #[test]
648   fn test_normalize_summary_verb_empty() {
649      let mut s = String::new();
650      normalize_summary_verb(&mut s, "feat");
651      assert_eq!(s, "");
652   }
653
654   #[test]
655   fn test_normalize_summary_verb_single_word() {
656      let mut s = "add".to_string();
657      normalize_summary_verb(&mut s, "feat");
658      assert_eq!(s, "added");
659   }
660
661   #[test]
662   fn test_normalize_summary_verb_harden_to_hardened() {
663      let mut s = "harden stealth scripts against detection".to_string();
664      normalize_summary_verb(&mut s, "fix");
665      assert_eq!(s, "hardened stealth scripts against detection");
666   }
667
668   #[test]
669   fn test_normalize_summary_verb_bind_to_bound() {
670      let mut s = "bind native methods to local constants".to_string();
671      normalize_summary_verb(&mut s, "fix");
672      assert_eq!(s, "bound native methods to local constants");
673   }
674
675   #[test]
676   fn test_normalize_summary_verb_third_person_ies() {
677      // -ies -> -y conversion (simplifies -> simplify -> simplified)
678      let mut s = "simplifies the config loading".to_string();
679      normalize_summary_verb(&mut s, "refactor");
680      assert_eq!(s, "simplified the config loading");
681   }
682
683   #[test]
684   fn test_normalize_summary_verb_third_person_es() {
685      // -es stripping (fixes -> fix -> fixed)
686      let mut s = "fixes race condition".to_string();
687      normalize_summary_verb(&mut s, "fix");
688      assert_eq!(s, "fixed race condition");
689   }
690
691   #[test]
692   fn test_normalize_summary_verb_suffix_reattach_dash() {
693      // Dash suffix should be reattached after conversion
694      let mut s = "isolate-subagent from main flow".to_string();
695      normalize_summary_verb(&mut s, "refactor");
696      assert_eq!(s, "isolated-subagent from main flow");
697   }
698
699   #[test]
700   fn test_normalize_summary_verb_skip_type_prefix_leak() {
701      // `fix(tui):` is a leaked conventional prefix, NOT a verb to convert.
702      // The `(tui):` suffix is not a dash/slash separator, so we skip.
703      let mut s = "fix(tui): rendering bug".to_string();
704      normalize_summary_verb(&mut s, "fix");
705      assert_eq!(s, "fix(tui): rendering bug");
706   }
707
708   #[test]
709   fn test_normalize_summary_verb_skip_acronym() {
710      // All-caps acronyms should not be converted
711      let mut s = "API response handling".to_string();
712      normalize_summary_verb(&mut s, "feat");
713      assert_eq!(s, "API response handling");
714   }
715
716   #[test]
717   fn test_normalize_summary_verb_skip_numeric() {
718      // Numeric-led tokens should not be converted
719      let mut s = "403 error handling".to_string();
720      normalize_summary_verb(&mut s, "fix");
721      assert_eq!(s, "403 error handling");
722   }
723
724   #[test]
725   fn test_normalize_summary_verb_already_past_hardened() {
726      // Already past tense should not be re-converted
727      let mut s = "hardened stealth scripts".to_string();
728      normalize_summary_verb(&mut s, "fix");
729      assert_eq!(s, "hardened stealth scripts");
730   }
731
732   #[test]
733   fn test_normalize_summary_verb_already_past_bound() {
734      let mut s = "bound native methods".to_string();
735      normalize_summary_verb(&mut s, "fix");
736      assert_eq!(s, "bound native methods");
737   }
738
739   #[test]
740   fn test_normalize_summary_verb_preserves_existing_third_person() {
741      // The old test had "adds" and "fixes" - ensure they still work
742      let mut s = "adds feature".to_string();
743      normalize_summary_verb(&mut s, "feat");
744      assert_eq!(s, "added feature");
745
746      let mut s = "fixes bug".to_string();
747      normalize_summary_verb(&mut s, "fix");
748      assert_eq!(s, "fixed bug");
749
750      let mut s = "updates docs".to_string();
751      normalize_summary_verb(&mut s, "docs");
752      assert_eq!(s, "updated docs");
753   }
754
755   #[test]
756   fn test_normalize_summary_verb_re_prefix_enable() {
757      let mut s = "re-enable formatting checks".to_string();
758      normalize_summary_verb(&mut s, "fix");
759      assert_eq!(s, "re-enabled formatting checks");
760   }
761
762   #[test]
763   fn test_normalize_summary_verb_re_prefix_run() {
764      let mut s = "re-run the test suite".to_string();
765      normalize_summary_verb(&mut s, "fix");
766      assert_eq!(s, "re-ran the test suite");
767   }
768
769   #[test]
770   fn test_normalize_summary_verb_re_prefix_with_tail() {
771      // re-format-checking -> re-formatted-checking
772      let mut s = "re-format-checking pipeline".to_string();
773      normalize_summary_verb(&mut s, "fix");
774      assert_eq!(s, "re-formatted-checking pipeline");
775   }
776
777   #[test]
778   fn test_normalize_summary_verb_re_prefix_already_past() {
779      // re-enabled is already past tense, should not be re-converted
780      let mut s = "re-enabled linting".to_string();
781      normalize_summary_verb(&mut s, "fix");
782      assert_eq!(s, "re-enabled linting");
783   }
784
785   // cap_details tests (budget-based)
786   #[test]
787   fn test_cap_details_under_budget() {
788      let mut details = vec!["first".to_string(), "second".to_string(), "third".to_string()];
789      let tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
790      cap_details(&mut details, tokens + 100);
791      assert_eq!(details.len(), 3);
792   }
793
794   #[test]
795   fn test_cap_details_at_budget() {
796      let mut details = vec![
797         "one".to_string(),
798         "two".to_string(),
799         "three".to_string(),
800         "four".to_string(),
801         "five".to_string(),
802         "six".to_string(),
803      ];
804      let tokens: usize = details.iter().map(|d| estimate_tokens(d)).sum();
805      cap_details(&mut details, tokens);
806      assert_eq!(details.len(), 6);
807   }
808
809   #[test]
810   fn test_cap_details_security_priority() {
811      let mut details = vec![
812         "normal change".to_string(),
813         "security vulnerability fixed".to_string(),
814         "another change".to_string(),
815         "third change".to_string(),
816         "fourth change".to_string(),
817         "fifth change".to_string(),
818         "sixth change".to_string(),
819      ];
820      // Budget for ~4 typical items (15 chars each = ~4 tokens, 4*4 = 16 tokens)
821      cap_details(&mut details, 60);
822      assert!(details.iter().any(|d| d.contains("security")));
823   }
824
825   #[test]
826   fn test_cap_details_performance_priority() {
827      let mut details = vec![
828         "normal change".to_string(),
829         "performance optimization added".to_string(),
830         "another change".to_string(),
831         "third change".to_string(),
832         "fourth change".to_string(),
833         "fifth change".to_string(),
834      ];
835      // Budget for ~3 typical items
836      cap_details(&mut details, 40);
837      assert!(details.iter().any(|d| d.contains("performance")));
838   }
839
840   #[test]
841   fn test_cap_details_api_priority() {
842      let mut details = vec![
843         "normal change".to_string(),
844         "API interface updated".to_string(),
845         "internal change".to_string(),
846         "another internal change".to_string(),
847         "yet another change".to_string(),
848      ];
849      // Budget for ~3 items
850      cap_details(&mut details, 50);
851      assert!(details.iter().any(|d| d.contains("API")));
852   }
853
854   #[test]
855   fn test_cap_details_preserves_order() {
856      let mut details = vec![
857         "first".to_string(),
858         "critical security fix".to_string(),
859         "third".to_string(),
860         "performance improvement".to_string(),
861         "fifth".to_string(),
862      ];
863      // Budget for ~3 items
864      cap_details(&mut details, 50);
865      // Should preserve relative order of kept items
866      let security_idx = details.iter().position(|d| d.contains("security"));
867      let perf_idx = details.iter().position(|d| d.contains("performance"));
868      assert!(security_idx.unwrap() < perf_idx.unwrap());
869   }
870
871   #[test]
872   fn test_cap_details_empty_list() {
873      let mut details: Vec<String> = vec![];
874      cap_details(&mut details, 100);
875      assert_eq!(details.len(), 0);
876   }
877
878   #[test]
879   fn test_cap_details_breaking_priority() {
880      let mut details = vec![
881         "normal change".to_string(),
882         "breaking change introduced".to_string(),
883         "another change".to_string(),
884         "third change".to_string(),
885         "fourth change".to_string(),
886      ];
887      // Budget for ~3 items
888      cap_details(&mut details, 50);
889      assert!(details.iter().any(|d| d.contains("breaking")));
890   }
891
892   #[test]
893   fn test_cap_details_budget_prefers_short_high_priority() {
894      // 6 short high-priority items should fit, but 2 long low-priority shouldn't
895      let mut details = vec![
896         "security fix".to_string(),     // ~12 chars, ~3 tokens, score 100
897         "bug fix".to_string(),          // ~7 chars, ~2 tokens, score 70
898         "API change".to_string(),       // ~10 chars, ~3 tokens, score 50
899         "performance gain".to_string(), // ~16 chars, ~4 tokens, score 80
900         "breaking change".to_string(),  // ~15 chars, ~4 tokens, score 90
901         "user feature".to_string(),     // ~12 chars, ~3 tokens, score 40
902         "This is a very long internal refactoring detail that adds no user value".to_string(), /* ~73 chars, ~19 tokens, score 0 */
903         "Another extremely long low priority change description here".to_string(), /* ~61 chars, ~16 tokens, score 0 */
904      ];
905      // Budget: 30 tokens (enough for all 6 short items, not enough for long ones)
906      cap_details(&mut details, 30);
907      // Should keep short high-priority items
908      assert!(details.iter().any(|d| d.contains("security")));
909      assert!(details.iter().any(|d| d.contains("breaking")));
910      // Should drop long low-priority items
911      assert!(!details.iter().any(|d| d.contains("very long internal")));
912   }
913
914   #[test]
915   fn test_cap_details_budget_allows_variable_count() {
916      // With same budget, should fit more short items or fewer long items
917      let short_details = vec![
918         "fix A".to_string(),
919         "fix B".to_string(),
920         "fix C".to_string(),
921         "fix D".to_string(),
922         "fix E".to_string(),
923         "fix F".to_string(),
924      ];
925      let long_details = vec![
926         "Fixed a critical security vulnerability in authentication".to_string(),
927         "Implemented comprehensive performance optimization".to_string(),
928         "Added extensive API documentation and examples".to_string(),
929      ];
930
931      let mut short = short_details;
932      let mut long = long_details;
933
934      cap_details(&mut short, 50); // Should fit all 6 short items (~2 tokens each)
935      cap_details(&mut long, 50); // Should fit only 2-3 long items (~13-15 tokens each)
936
937      assert!(short.len() >= 5); // Most short items fit
938      assert!(long.len() <= 3); // Fewer long items fit
939   }
940
941   // format_commit_message tests
942   #[test]
943   fn test_format_commit_message_type_summary_only() {
944      let commit = ConventionalCommit {
945         commit_type: CommitType::new("feat").unwrap(),
946         scope:       None,
947         summary:     CommitSummary::new_unchecked("added new feature", 128).unwrap(),
948         body:        vec![],
949         footers:     vec![],
950      };
951      assert_eq!(format_commit_message(&commit), "feat: added new feature");
952   }
953
954   #[test]
955   fn test_format_commit_message_with_scope() {
956      let commit = ConventionalCommit {
957         commit_type: CommitType::new("fix").unwrap(),
958         scope:       Some(Scope::new("api").unwrap()),
959         summary:     CommitSummary::new_unchecked("fixed bug", 128).unwrap(),
960         body:        vec![],
961         footers:     vec![],
962      };
963      assert_eq!(format_commit_message(&commit), "fix(api): fixed bug");
964   }
965
966   #[test]
967   fn test_format_commit_message_with_body() {
968      let commit = ConventionalCommit {
969         commit_type: CommitType::new("feat").unwrap(),
970         scope:       None,
971         summary:     CommitSummary::new_unchecked("added feature", 128).unwrap(),
972         body:        vec!["First detail.".to_string(), "Second detail.".to_string()],
973         footers:     vec![],
974      };
975      let expected = "feat: added feature\n\n- First detail.\n- Second detail.";
976      assert_eq!(format_commit_message(&commit), expected);
977   }
978
979   #[test]
980   fn test_format_commit_message_with_footers() {
981      let commit = ConventionalCommit {
982         commit_type: CommitType::new("fix").unwrap(),
983         scope:       None,
984         summary:     CommitSummary::new_unchecked("fixed bug", 128).unwrap(),
985         body:        vec![],
986         footers:     vec!["Closes: #123".to_string(), "Fixes: #456".to_string()],
987      };
988      let expected = "fix: fixed bug\n\nCloses: #123\nFixes: #456";
989      assert_eq!(format_commit_message(&commit), expected);
990   }
991
992   #[test]
993   fn test_format_commit_message_full() {
994      let commit = ConventionalCommit {
995         commit_type: CommitType::new("feat").unwrap(),
996         scope:       Some(Scope::new("auth").unwrap()),
997         summary:     CommitSummary::new_unchecked("added oauth support", 128).unwrap(),
998         body:        vec![
999            "Implemented OAuth2 flow.".to_string(),
1000            "Added token refresh.".to_string(),
1001         ],
1002         footers:     vec!["Closes: #789".to_string()],
1003      };
1004      let expected = "feat(auth): added oauth support\n\n- Implemented OAuth2 flow.\n- Added \
1005                      token refresh.\n\nCloses: #789";
1006      assert_eq!(format_commit_message(&commit), expected);
1007   }
1008
1009   #[test]
1010   fn test_format_commit_message_nested_scope() {
1011      let commit = ConventionalCommit {
1012         commit_type: CommitType::new("refactor").unwrap(),
1013         scope:       Some(Scope::new("api/client").unwrap()),
1014         summary:     CommitSummary::new_unchecked("restructured code", 128).unwrap(),
1015         body:        vec![],
1016         footers:     vec![],
1017      };
1018      assert_eq!(format_commit_message(&commit), "refactor(api/client): restructured code");
1019   }
1020}
llm_git/normalization.rs

llm_git/
normalization.rs