Skip to main content

llm_git/
map_reduce.rs

1//! Map-reduce pattern for large diff analysis
2//!
3//! When diffs exceed the token threshold, this module splits analysis across
4//! files, then synthesizes results for accurate classification.
5
6use std::path::Path;
7
8use futures::stream::{self, StreamExt};
9use serde::{Deserialize, Serialize};
10
11use crate::{
12   api::retry_api_call,
13   config::{CommitConfig, ResolvedApiMode},
14   diff::{FileDiff, parse_diff, reconstruct_diff},
15   error::{CommitGenError, Result},
16   templates,
17   tokens::TokenCounter,
18   types::ConventionalAnalysis,
19};
20
/// Observation from a single file during map phase.
///
/// Serialized to JSON (see `reduce_phase`) and embedded in the reduce prompt,
/// so the field names are part of the model-facing input format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileObservation {
   // Path of the file these observations describe.
   pub file:         String,
   // Factual statements about what changed in the file.
   pub observations: Vec<String>,
   // Added/deleted line counts. NOTE(review): within this view the map phase
   // always leaves these at 0 despite the "Will be filled from FileDiff"
   // comment — confirm they are populated before the reduce prompt is built.
   pub additions:    usize,
   pub deletions:    usize,
}
29
/// Minimum files to justify map-reduce overhead (below this, unified is fine)
const MIN_FILES_FOR_MAP_REDUCE: usize = 4;

/// Maximum tokens per file in map phase (leave headroom for prompt template +
/// context). Files above this are truncated before the per-file API call.
const MAX_FILE_TOKENS: usize = 50_000;
36
37/// Check if map-reduce should be used
38/// Always use map-reduce except for:
39/// 1. Explicitly disabled in config
40/// 2. Very small diffs (≤3 files) where overhead isn't worth it
41pub fn should_use_map_reduce(diff: &str, config: &CommitConfig, counter: &TokenCounter) -> bool {
42   if !config.map_reduce_enabled {
43      return false;
44   }
45
46   let files = parse_diff(diff);
47   let file_count = files
48      .iter()
49      .filter(|f| {
50         !config
51            .excluded_files
52            .iter()
53            .any(|ex| f.filename.ends_with(ex))
54      })
55      .count();
56
57   // Use map-reduce for 4+ files, or if any single file would need truncation
58   file_count >= MIN_FILES_FOR_MAP_REDUCE
59      || files
60         .iter()
61         .any(|f| f.token_estimate(counter) > MAX_FILE_TOKENS)
62}
63
/// Maximum files to include in context header (prevent token explosion).
/// When exceeded, only the largest changes by line count are listed.
const MAX_CONTEXT_FILES: usize = 20;
66
67/// Generate context header summarizing other files for cross-file awareness
68fn generate_context_header(files: &[FileDiff], current_file: &str) -> String {
69   // Skip context header for very large commits (diminishing returns)
70   if files.len() > 100 {
71      return format!("(Large commit with {} total files)", files.len());
72   }
73
74   let mut lines = vec!["OTHER FILES IN THIS CHANGE:".to_string()];
75
76   let other_files: Vec<_> = files
77      .iter()
78      .filter(|f| f.filename != current_file)
79      .collect();
80
81   let total_other = other_files.len();
82
83   // Only show top files by change size if too many
84   let to_show: Vec<&FileDiff> = if total_other > MAX_CONTEXT_FILES {
85      let mut sorted = other_files;
86      sorted.sort_by_key(|f| std::cmp::Reverse(f.additions + f.deletions));
87      sorted.truncate(MAX_CONTEXT_FILES);
88      sorted
89   } else {
90      other_files
91   };
92
93   for file in &to_show {
94      let line_count = file.additions + file.deletions;
95      let description = infer_file_description(&file.filename, &file.content);
96      lines.push(format!("- {} ({} lines): {}", file.filename, line_count, description));
97   }
98
99   if to_show.len() < total_other {
100      lines.push(format!("... and {} more files", total_other - to_show.len()));
101   }
102
103   if lines.len() == 1 {
104      return String::new(); // No other files
105   }
106
107   lines.join("\n")
108}
109
/// Infer a brief description of what a file likely contains based on
/// name/content.
///
/// Filename heuristics run first (in priority order); content heuristics are
/// the fallback, and `"source code"` is the catch-all.
fn infer_file_description(filename: &str, content: &str) -> &'static str {
   let lower = filename.to_lowercase();
   // Case-insensitive extension check against the real path extension.
   let has_ext = |target: &str| {
      Path::new(filename)
         .extension()
         .is_some_and(|e| e.eq_ignore_ascii_case(target))
   };

   // Filename-based heuristics.
   if lower.contains("test") {
      return "test file";
   }
   if has_ext("md") {
      return "documentation";
   }
   if lower.contains("config") || has_ext("toml") || has_ext("yaml") || has_ext("yml") {
      return "configuration";
   }
   if lower.contains("error") {
      return "error definitions";
   }
   if lower.contains("type") {
      return "type definitions";
   }
   if lower.ends_with("mod.rs") || lower.ends_with("lib.rs") {
      return "module exports";
   }
   if ["main.rs", "main.go", "main.py"]
      .iter()
      .any(|entry| lower.ends_with(entry))
   {
      return "entry point";
   }

   // Content-based heuristics when the name is uninformative.
   if content.contains("impl ") || content.contains("fn ") {
      return "implementation";
   }
   if content.contains("struct ") || content.contains("enum ") {
      return "type definitions";
   }
   if content.contains("async ") || content.contains("await") {
      return "async code";
   }

   "source code"
}
162
163/// Map phase: analyze each file individually and extract observations
164async fn map_phase(
165   files: &[FileDiff],
166   model_name: &str,
167   config: &CommitConfig,
168   counter: &TokenCounter,
169) -> Result<Vec<FileObservation>> {
170   // Process files concurrently using futures stream
171   let observations: Vec<Result<FileObservation>> = stream::iter(files.iter())
172      .map(|file| async {
173         if file.is_binary {
174            return Ok(FileObservation {
175               file:         file.filename.clone(),
176               observations: vec!["Binary file changed.".to_string()],
177               additions:    0,
178               deletions:    0,
179            });
180         }
181
182         let context_header = generate_context_header(files, &file.filename);
183         // Truncate large files to fit API limits
184         let mut file_clone = file.clone();
185         let file_tokens = file_clone.token_estimate(counter);
186         if file_tokens > MAX_FILE_TOKENS {
187            let target_size = MAX_FILE_TOKENS * 4; // Convert tokens to chars
188            file_clone.truncate(target_size);
189            eprintln!(
190               "  {} truncated {} ({} \u{2192} {} tokens)",
191               crate::style::icons::WARNING,
192               file.filename,
193               file_tokens,
194               file_clone.token_estimate(counter)
195            );
196         }
197
198         let file_diff = reconstruct_diff(&[file_clone]);
199
200         map_single_file(&file.filename, &file_diff, &context_header, model_name, config).await
201      })
202      .buffer_unordered(8)
203      .collect()
204      .await;
205
206   // Collect results, failing fast on first error
207   observations.into_iter().collect()
208}
209
/// Analyze a single file and extract observations.
///
/// Sends the file's diff plus a cross-file `context_header` to the model and
/// parses the `create_file_observation` tool call out of the response,
/// handling both wire protocols (OpenAI chat-completions and Anthropic
/// messages).
///
/// Runs inside `retry_api_call`: the closure returns `Ok((true, None))` to
/// request a retry (server errors, empty bodies), `Ok((false, Some(obs)))` on
/// success, and `Err` for permanent failures (client errors, unparseable
/// responses).
///
/// The returned observation's `additions`/`deletions` are left at 0; the
/// caller holds the parsed `FileDiff` stats.
async fn map_single_file(
   filename: &str,
   file_diff: &str,
   context_header: &str,
   model_name: &str,
   config: &CommitConfig,
) -> Result<FileObservation> {
   retry_api_call(config, async move || {
      let client = crate::api::get_client(config);

      let tool = build_observation_tool();

      let parts = templates::render_map_prompt("default", filename, file_diff, context_header)?;
      let mode = config.resolved_api_mode(model_name);

      // Phase 1: issue the request over whichever protocol the model resolves
      // to, yielding the raw response body.
      let response_text = match mode {
         ResolvedApiMode::ChatCompletions => {
            let request = build_api_request(
               config,
               model_name,
               config.temperature,
               vec![tool],
               &parts.system,
               &parts.user,
               "map",
               "default",
            );

            let mut request_builder = client
               .post(format!("{}/chat/completions", config.api_base_url))
               .header("content-type", "application/json");

            if let Some(api_key) = &config.api_key {
               request_builder =
                  request_builder.header("Authorization", format!("Bearer {api_key}"));
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/map", model_name).await?;

            // 5xx responses are transient: log and retry.
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            // Any other non-2xx (4xx) is permanent.
            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
         ResolvedApiMode::AnthropicMessages => {
            let prompt_caching = anthropic_prompt_caching_enabled(config);
            // Force the observation tool so the reply is structured.
            let mut tools = vec![AnthropicTool {
               name:         "create_file_observation".to_string(),
               description:  "Extract observations from a single file's changes".to_string(),
               input_schema: serde_json::json!({
                  "type": "object",
                  "properties": {
                     "observations": {
                        "type": "array",
                        "description": "List of factual observations about what changed in this file",
                        "items": {"type": "string"}
                     }
                  },
                  "required": ["observations"]
               }),
               cache_control: None,
            }];
            cache_last_anthropic_tool(&mut tools, prompt_caching);

            let request = AnthropicRequest {
               model:       model_name.to_string(),
               max_tokens:  1500,
               temperature: config.temperature,
               system:      anthropic_system_content(&parts.system, prompt_caching),
               tools,
               tool_choice: Some(AnthropicToolChoice {
                  choice_type: "tool".to_string(),
                  name:        "create_file_observation".to_string(),
               }),
               messages:    vec![AnthropicMessage {
                  role:    "user".to_string(),
                  content: vec![anthropic_text_content(parts.user, false)],
               }],
            };

            let mut request_builder = append_anthropic_cache_beta_header(
               client
                  .post(anthropic_messages_url(&config.api_base_url))
                  .header("content-type", "application/json")
                  .header("anthropic-version", "2023-06-01"),
               prompt_caching,
            );

            if let Some(api_key) = &config.api_key {
               request_builder = request_builder.header("x-api-key", api_key);
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/map", model_name).await?;

            // Same retry/permanent split as the chat-completions path.
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
      };

      if response_text.trim().is_empty() {
         crate::style::warn("Model returned empty response body for observation; retrying.");
         return Ok((true, None));
      }

      // Phase 2: parse the body per protocol into a FileObservation.
      match mode {
         ResolvedApiMode::ChatCompletions => {
            let api_response: ApiResponse = serde_json::from_str(&response_text).map_err(|e| {
               CommitGenError::Other(format!(
                  "Failed to parse observation response JSON: {e}. Response body: {}",
                  response_snippet(&response_text, 500)
               ))
            })?;

            if api_response.choices.is_empty() {
               return Err(CommitGenError::Other(
                  "API returned empty response for file observation".to_string(),
               ));
            }

            let message = &api_response.choices[0].message;

            // Preferred path: structured tool call. `ends_with` tolerates
            // provider-prefixed tool names.
            if !message.tool_calls.is_empty() {
               let tool_call = &message.tool_calls[0];
               if tool_call.function.name.ends_with("create_file_observation") {
                  let args = &tool_call.function.arguments;
                  if args.is_empty() {
                     return Err(CommitGenError::Other(
                        "Model returned empty function arguments for observation".to_string(),
                     ));
                  }

                  let obs: FileObservationResponse = serde_json::from_str(args).map_err(|e| {
                     CommitGenError::Other(format!("Failed to parse observation response: {e}"))
                  })?;

                  return Ok((
                     false,
                     Some(FileObservation {
                        file:         filename.to_string(),
                        observations: obs.observations,
                        additions:    0, // Will be filled from FileDiff
                        deletions:    0,
                     }),
                  ));
               }
            }

            // Fallback: try to parse content
            if let Some(content) = &message.content {
               if content.trim().is_empty() {
                  crate::style::warn("Model returned empty content for observation; retrying.");
                  return Ok((true, None));
               }
               let obs: FileObservationResponse =
                  serde_json::from_str(content.trim()).map_err(|e| {
                     CommitGenError::Other(format!(
                        "Failed to parse observation content JSON: {e}. Content: {}",
                        response_snippet(content, 500)
                     ))
                  })?;
               return Ok((
                  false,
                  Some(FileObservation {
                     file:         filename.to_string(),
                     observations: obs.observations,
                     additions:    0,
                     deletions:    0,
                  }),
               ));
            }

            Err(CommitGenError::Other("No observation found in API response".to_string()))
         },
         ResolvedApiMode::AnthropicMessages => {
            let (tool_input, text_content, stop_reason) =
               extract_anthropic_content(&response_text, "create_file_observation")?;

            if let Some(input) = tool_input {
               // Tolerate both the schema's array form and a stringified list.
               let mut observations = match input.get("observations") {
                  Some(serde_json::Value::Array(arr)) => arr
                     .iter()
                     .filter_map(|v| v.as_str().map(str::to_string))
                     .collect::<Vec<_>>(),
                  Some(serde_json::Value::String(s)) => parse_string_to_observations(s),
                  _ => Vec::new(),
               };

               // Empty tool input: salvage from free text, then from a
               // max_tokens fallback, before giving up with zero observations.
               if observations.is_empty() {
                  let text_observations = parse_observations_from_text(&text_content);
                  if !text_observations.is_empty() {
                     observations = text_observations;
                  } else if stop_reason.as_deref() == Some("max_tokens") {
                     crate::style::warn(
                        "Anthropic stopped at max_tokens with empty observations; using fallback \
                         observation.",
                     );
                     let fallback_target = Path::new(filename)
                        .file_name()
                        .and_then(|name| name.to_str())
                        .unwrap_or(filename);
                     observations = vec![format!("Updated {fallback_target}.")];
                  } else {
                     crate::style::warn(
                        "Model returned empty observation tool input; continuing with no \
                         observations.",
                     );
                  }
               }

               return Ok((
                  false,
                  Some(FileObservation {
                     file: filename.to_string(),
                     observations,
                     additions: 0,
                     deletions: 0,
                  }),
               ));
            }

            if text_content.trim().is_empty() {
               crate::style::warn("Model returned empty content for observation; retrying.");
               return Ok((true, None));
            }

            // No tool call at all: expect the text itself to be the JSON.
            let obs: FileObservationResponse =
               serde_json::from_str(text_content.trim()).map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse observation content JSON: {e}. Content: {}",
                     response_snippet(&text_content, 500)
                  ))
               })?;
            Ok((
               false,
               Some(FileObservation {
                  file:         filename.to_string(),
                  observations: obs.observations,
                  additions:    0,
                  deletions:    0,
               }),
            ))
         },
      }
   }).await
}
484
/// Reduce phase: synthesize all observations into final analysis.
///
/// Serializes the per-file observations to JSON, renders the reduce prompt
/// (with the diff stat, scope candidates, and the configured commit types),
/// and asks the model for a `create_conventional_analysis` tool call over
/// either wire protocol.
///
/// Runs inside `retry_api_call` with the same protocol as the map phase:
/// `Ok((true, None))` retries, `Ok((false, Some(analysis)))` succeeds, `Err`
/// fails permanently.
pub async fn reduce_phase(
   observations: &[FileObservation],
   stat: &str,
   scope_candidates: &str,
   model_name: &str,
   config: &CommitConfig,
) -> Result<ConventionalAnalysis> {
   retry_api_call(config, async move || {
      let client = crate::api::get_client(config);

      // Build type enum from config
      let type_enum: Vec<&str> = config.types.keys().map(|s| s.as_str()).collect();

      let tool = build_analysis_tool(&type_enum);

      // Serialization failure degrades to an empty list rather than aborting.
      let observations_json =
         serde_json::to_string_pretty(observations).unwrap_or_else(|_| "[]".to_string());

      let types_description = crate::api::format_types_description(config);
      let parts = templates::render_reduce_prompt(
         "default",
         &observations_json,
         stat,
         scope_candidates,
         Some(&types_description),
      )?;
      let mode = config.resolved_api_mode(model_name);

      // Phase 1: issue the request, yielding the raw response body.
      let response_text = match mode {
         ResolvedApiMode::ChatCompletions => {
            let request = build_api_request(
               config,
               model_name,
               config.temperature,
               vec![tool],
               &parts.system,
               &parts.user,
               "reduce",
               "default",
            );

            let mut request_builder = client
               .post(format!("{}/chat/completions", config.api_base_url))
               .header("content-type", "application/json");

            if let Some(api_key) = &config.api_key {
               request_builder =
                  request_builder.header("Authorization", format!("Bearer {api_key}"));
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/reduce", model_name).await?;

            // 5xx is transient: log and retry; other non-2xx is permanent.
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
         ResolvedApiMode::AnthropicMessages => {
            let prompt_caching = anthropic_prompt_caching_enabled(config);
            // Force the analysis tool so the reply is structured.
            let mut tools = vec![AnthropicTool {
               name:         "create_conventional_analysis".to_string(),
               description:  "Analyze changes and classify as conventional commit with type, \
                              scope, details, and metadata"
                  .to_string(),
               input_schema: serde_json::json!({
                     "type": "object",
                     "properties": {
                        "type": {
                           "type": "string",
                           "enum": type_enum,
                           "description": "Commit type based on change classification"
                        },
                        "scope": {
                           "type": "string",
                           "description": "Optional scope (module/component). Omit if unclear or multi-component."
                        },
                        "details": {
                           "type": "array",
                           "description": "Array of 0-6 detail items with changelog metadata.",
                           "items": {
                              "type": "object",
                              "properties": {
                                 "text": {
                                    "type": "string",
                                    "description": "Detail about change, starting with past-tense verb, ending with period"
                                 },
                                 "changelog_category": {
                                    "type": "string",
                                    "enum": ["Added", "Changed", "Fixed", "Deprecated", "Removed", "Security"],
                                    "description": "Changelog category if user-visible. Omit for internal changes."
                                 },
                                 "user_visible": {
                                    "type": "boolean",
                                    "description": "True if this change affects users/API and should appear in changelog"
                                 }
                              },
                              "required": ["text", "user_visible"]
                           }
                        },
                        "issue_refs": {
                           "type": "array",
                           "description": "Issue numbers from context (e.g., ['#123', '#456']). Empty if none.",
                           "items": {
                              "type": "string"
                           }
                        }
                     },
                     "required": ["type", "details", "issue_refs"]
                  }),
               cache_control: None,
            }];
            cache_last_anthropic_tool(&mut tools, prompt_caching);

            let request = AnthropicRequest {
               model:       model_name.to_string(),
               max_tokens:  1500,
               temperature: config.temperature,
               system:      anthropic_system_content(&parts.system, prompt_caching),
               tools,
               tool_choice: Some(AnthropicToolChoice {
                  choice_type: "tool".to_string(),
                  name:        "create_conventional_analysis".to_string(),
               }),
               messages:    vec![AnthropicMessage {
                  role:    "user".to_string(),
                  content: vec![anthropic_text_content(parts.user, false)],
               }],
            };

            let mut request_builder = append_anthropic_cache_beta_header(
               client
                  .post(anthropic_messages_url(&config.api_base_url))
                  .header("content-type", "application/json")
                  .header("anthropic-version", "2023-06-01"),
               prompt_caching,
            );

            if let Some(api_key) = &config.api_key {
               request_builder = request_builder.header("x-api-key", api_key);
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/reduce", model_name).await?;

            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None));
            }

            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
      };

      if response_text.trim().is_empty() {
         crate::style::warn("Model returned empty response body for synthesis; retrying.");
         return Ok((true, None));
      }

      // Phase 2: parse the body per protocol into a ConventionalAnalysis.
      match mode {
         ResolvedApiMode::ChatCompletions => {
            let api_response: ApiResponse = serde_json::from_str(&response_text).map_err(|e| {
               CommitGenError::Other(format!(
                  "Failed to parse synthesis response JSON: {e}. Response body: {}",
                  response_snippet(&response_text, 500)
               ))
            })?;

            if api_response.choices.is_empty() {
               return Err(CommitGenError::Other(
                  "API returned empty response for synthesis".to_string(),
               ));
            }

            let message = &api_response.choices[0].message;

            // Preferred path: structured tool call. `ends_with` tolerates
            // provider-prefixed tool names.
            if !message.tool_calls.is_empty() {
               let tool_call = &message.tool_calls[0];
               if tool_call
                  .function
                  .name
                  .ends_with("create_conventional_analysis")
               {
                  let args = &tool_call.function.arguments;
                  if args.is_empty() {
                     return Err(CommitGenError::Other(
                        "Model returned empty function arguments for synthesis".to_string(),
                     ));
                  }

                  let analysis: ConventionalAnalysis = serde_json::from_str(args).map_err(|e| {
                     CommitGenError::Other(format!("Failed to parse synthesis response: {e}"))
                  })?;

                  return Ok((false, Some(analysis)));
               }
            }

            // Fallback
            if let Some(content) = &message.content {
               if content.trim().is_empty() {
                  crate::style::warn("Model returned empty content for synthesis; retrying.");
                  return Ok((true, None));
               }
               let analysis: ConventionalAnalysis =
                  serde_json::from_str(content.trim()).map_err(|e| {
                     CommitGenError::Other(format!(
                        "Failed to parse synthesis content JSON: {e}. Content: {}",
                        response_snippet(content, 500)
                     ))
                  })?;
               return Ok((false, Some(analysis)));
            }

            Err(CommitGenError::Other("No analysis found in synthesis response".to_string()))
         },
         ResolvedApiMode::AnthropicMessages => {
            let (tool_input, text_content, stop_reason) =
               extract_anthropic_content(&response_text, "create_conventional_analysis")?;

            if let Some(input) = tool_input {
               let analysis: ConventionalAnalysis = serde_json::from_value(input).map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse synthesis tool input: {e}. Response body: {}",
                     response_snippet(&response_text, 500)
                  ))
               })?;
               return Ok((false, Some(analysis)));
            }

            if text_content.trim().is_empty() {
               if stop_reason.as_deref() == Some("max_tokens") {
                  crate::style::warn(
                     "Anthropic stopped at max_tokens with empty synthesis; retrying.",
                  );
                  return Ok((true, None));
               }
               crate::style::warn("Model returned empty content for synthesis; retrying.");
               return Ok((true, None));
            }

            // No tool call at all: expect the text itself to be the JSON.
            let analysis: ConventionalAnalysis = serde_json::from_str(text_content.trim())
               .map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse synthesis content JSON: {e}. Content: {}",
                     response_snippet(&text_content, 500)
                  ))
               })?;
            Ok((false, Some(analysis)))
         },
      }
   }).await
}
761
762/// Run full map-reduce pipeline for large diffs
763pub async fn run_map_reduce(
764   diff: &str,
765   stat: &str,
766   scope_candidates: &str,
767   model_name: &str,
768   config: &CommitConfig,
769   counter: &TokenCounter,
770) -> Result<ConventionalAnalysis> {
771   let mut files = parse_diff(diff);
772
773   // Filter excluded files
774   files.retain(|f| {
775      !config
776         .excluded_files
777         .iter()
778         .any(|excluded| f.filename.ends_with(excluded))
779   });
780
781   if files.is_empty() {
782      return Err(CommitGenError::Other(
783         "No relevant files to analyze after filtering".to_string(),
784      ));
785   }
786
787   let file_count = files.len();
788   crate::style::print_info(&format!("Running map-reduce on {file_count} files..."));
789
790   // Map phase
791   let observations = map_phase(&files, model_name, config, counter).await?;
792
793   // Reduce phase
794   reduce_phase(&observations, stat, scope_candidates, model_name, config).await
795}
796
797// ============================================================================
798// API types (duplicated from api.rs to avoid circular deps)
799// ============================================================================
800
/// Return a trimmed, length-limited snippet of a response body for use in
/// error messages.
///
/// Bodies longer than `limit` bytes are cut and suffixed with `"..."`. The
/// cut is clamped to a UTF-8 character boundary: `String::truncate` panics on
/// a mid-character index, and response bodies routinely contain multi-byte
/// characters, so the previous unconditional `truncate(limit)` could panic.
fn response_snippet(body: &str, limit: usize) -> String {
   if body.is_empty() {
      return "<empty response body>".to_string();
   }
   let trimmed = body.trim();
   if trimmed.len() <= limit {
      return trimmed.to_string();
   }
   // Back off to the nearest char boundary at or below `limit`.
   let mut cut = limit;
   while cut > 0 && !trimmed.is_char_boundary(cut) {
      cut -= 1;
   }
   let mut snippet = trimmed[..cut].to_string();
   snippet.push_str("...");
   snippet
}
812
813fn parse_observations_from_text(text: &str) -> Vec<String> {
814   let trimmed = text.trim();
815   if trimmed.is_empty() {
816      return Vec::new();
817   }
818
819   if let Ok(obs) = serde_json::from_str::<FileObservationResponse>(trimmed) {
820      return obs.observations;
821   }
822
823   trimmed
824      .lines()
825      .map(str::trim)
826      .filter(|line| !line.is_empty())
827      .map(|line| {
828         line
829            .strip_prefix("- ")
830            .or_else(|| line.strip_prefix("* "))
831            .unwrap_or(line)
832            .trim()
833      })
834      .filter(|line| !line.is_empty())
835      .map(str::to_string)
836      .collect()
837}
838
/// Build the Anthropic messages endpoint from a configured base URL.
///
/// Accepts bases with or without a trailing slash and with or without a
/// `/v1` suffix, always producing exactly one `/v1/messages` path.
fn anthropic_messages_url(base_url: &str) -> String {
   let base = base_url.trim_end_matches('/');
   let suffix = if base.ends_with("/v1") { "messages" } else { "v1/messages" };
   format!("{base}/{suffix}")
}
847
848fn prompt_cache_control() -> PromptCacheControl {
849   PromptCacheControl { control_type: "ephemeral".to_string() }
850}
851
852fn anthropic_prompt_caching_enabled(config: &CommitConfig) -> bool {
853   config.api_base_url.to_lowercase().contains("anthropic.com")
854}
855
856fn append_anthropic_cache_beta_header(
857   request_builder: reqwest::RequestBuilder,
858   enable_cache: bool,
859) -> reqwest::RequestBuilder {
860   if enable_cache {
861      request_builder.header("anthropic-beta", "prompt-caching-2024-07-31")
862   } else {
863      request_builder
864   }
865}
866
867fn anthropic_text_content(text: String, cache: bool) -> AnthropicContent {
868   AnthropicContent {
869      content_type: "text".to_string(),
870      text,
871      cache_control: cache.then(prompt_cache_control),
872   }
873}
874
875fn anthropic_system_content(system_prompt: &str, cache: bool) -> Option<Vec<AnthropicContent>> {
876   if system_prompt.trim().is_empty() {
877      None
878   } else {
879      Some(vec![anthropic_text_content(system_prompt.to_string(), cache)])
880   }
881}
882
883fn cache_last_anthropic_tool(tools: &mut [AnthropicTool], cache: bool) {
884   if cache && let Some(last) = tools.last_mut() {
885      last.cache_control = Some(prompt_cache_control());
886   }
887}
888
889fn extract_anthropic_content(
890   response_text: &str,
891   tool_name: &str,
892) -> Result<(Option<serde_json::Value>, String, Option<String>)> {
893   let value: serde_json::Value = serde_json::from_str(response_text).map_err(|e| {
894      CommitGenError::Other(format!(
895         "Failed to parse Anthropic response JSON: {e}. Response body: {}",
896         response_snippet(response_text, 500)
897      ))
898   })?;
899
900   let stop_reason = value
901      .get("stop_reason")
902      .and_then(|v| v.as_str())
903      .map(str::to_string);
904
905   let mut tool_input: Option<serde_json::Value> = None;
906   let mut text_parts = Vec::new();
907
908   if let Some(content) = value.get("content").and_then(|v| v.as_array()) {
909      for item in content {
910         let item_type = item.get("type").and_then(|v| v.as_str()).unwrap_or("");
911         match item_type {
912            "tool_use" => {
913               let name = item.get("name").and_then(|v| v.as_str()).unwrap_or("");
914               if name == tool_name
915                  && let Some(input) = item.get("input")
916               {
917                  tool_input = Some(input.clone());
918               }
919            },
920            "text" => {
921               if let Some(text) = item.get("text").and_then(|v| v.as_str()) {
922                  text_parts.push(text.to_string());
923               }
924            },
925            _ => {},
926         }
927      }
928   }
929
930   Ok((tool_input, text_parts.join("\n"), stop_reason))
931}
932
/// One chat message in an OpenAI-style `messages` array.
#[derive(Debug, Serialize)]
struct Message {
   role:    String,
   content: String,
}
938
/// JSON Schema object describing a tool function's parameters
/// (OpenAI function-calling format); `param_type` serializes as `"type"`.
#[derive(Debug, Serialize, Deserialize)]
struct FunctionParameters {
   #[serde(rename = "type")]
   param_type: String,
   properties: serde_json::Value,
   required:   Vec<String>,
}
946
/// A callable function exposed to the model (OpenAI format).
#[derive(Debug, Serialize, Deserialize)]
struct Function {
   name:        String,
   description: String,
   parameters:  FunctionParameters,
}
953
/// OpenAI-style tool wrapper; this module always builds it with
/// `tool_type` set to `"function"`.
#[derive(Debug, Serialize, Deserialize)]
struct Tool {
   #[serde(rename = "type")]
   tool_type: String,
   function:  Function,
}
960
/// Request body for an OpenAI-compatible chat-completions call.
/// Optional fields are omitted from the JSON when `None`.
#[derive(Debug, Serialize)]
struct ApiRequest {
   model:            String,
   max_tokens:       u32,
   temperature:      f32,
   tools:            Vec<Tool>,
   #[serde(skip_serializing_if = "Option::is_none")]
   tool_choice:      Option<serde_json::Value>,
   // Deterministic key derived in build_api_request via
   // crate::api::openai_prompt_cache_key.
   #[serde(skip_serializing_if = "Option::is_none")]
   prompt_cache_key: Option<String>,
   messages:         Vec<Message>,
}
973
/// Request body for Anthropic's native messages API.
/// Optional fields are omitted from the JSON when `None`.
#[derive(Debug, Serialize)]
struct AnthropicRequest {
   model:       String,
   max_tokens:  u32,
   temperature: f32,
   #[serde(skip_serializing_if = "Option::is_none")]
   system:      Option<Vec<AnthropicContent>>,
   tools:       Vec<AnthropicTool>,
   #[serde(skip_serializing_if = "Option::is_none")]
   tool_choice: Option<AnthropicToolChoice>,
   messages:    Vec<AnthropicMessage>,
}
986
/// `cache_control` marker for Anthropic prompt caching; `control_type`
/// serializes as `"type"` (always `"ephemeral"` here, see
/// `prompt_cache_control`).
#[derive(Debug, Clone, Serialize)]
struct PromptCacheControl {
   #[serde(rename = "type")]
   control_type: String,
}
992
/// Tool definition in Anthropic's native format; `cache_control` is set on
/// the last tool by `cache_last_anthropic_tool` when caching is enabled.
#[derive(Debug, Serialize)]
struct AnthropicTool {
   name:          String,
   description:   String,
   input_schema:  serde_json::Value,
   #[serde(skip_serializing_if = "Option::is_none")]
   cache_control: Option<PromptCacheControl>,
}
1001
/// Anthropic `tool_choice` payload; `choice_type` serializes as `"type"`.
#[derive(Debug, Serialize)]
struct AnthropicToolChoice {
   #[serde(rename = "type")]
   choice_type: String,
   name:        String,
}
1008
/// One message in an Anthropic request; content is a list of typed blocks
/// rather than a bare string.
#[derive(Debug, Serialize)]
struct AnthropicMessage {
   role:    String,
   content: Vec<AnthropicContent>,
}
1014
/// A single text content block in an Anthropic message; `content_type`
/// serializes as `"type"` and may carry a prompt-cache marker.
#[derive(Debug, Clone, Serialize)]
struct AnthropicContent {
   #[serde(rename = "type")]
   content_type:  String,
   text:          String,
   #[serde(skip_serializing_if = "Option::is_none")]
   cache_control: Option<PromptCacheControl>,
}
1023
/// Tool call entry in an OpenAI-style response message.
#[derive(Debug, Deserialize)]
struct ToolCall {
   function: FunctionCall,
}
1028
/// Invoked function inside a tool call; `arguments` is a JSON-encoded
/// string, not a parsed object.
#[derive(Debug, Deserialize)]
struct FunctionCall {
   name:      String,
   arguments: String,
}
1034
/// One completion choice in an OpenAI-style response.
#[derive(Debug, Deserialize)]
struct Choice {
   message: ResponseMessage,
}
1039
/// Assistant message in a response; both fields default so responses with
/// only text or only tool calls deserialize cleanly.
#[derive(Debug, Deserialize)]
struct ResponseMessage {
   #[serde(default)]
   tool_calls: Vec<ToolCall>,
   #[serde(default)]
   content:    Option<String>,
}
1047
/// Minimal deserialization target for an OpenAI-style completion response.
#[derive(Debug, Deserialize)]
struct ApiResponse {
   choices: Vec<Choice>,
}
1052
/// Payload of the `create_file_observation` tool call; `observations` is
/// parsed leniently (array, stringified array, or bullet string) via
/// `deserialize_observations`.
#[derive(Debug, Deserialize)]
struct FileObservationResponse {
   #[serde(deserialize_with = "deserialize_observations")]
   observations: Vec<String>,
}
1058
1059/// Deserialize observations flexibly: handles array, stringified array, or
1060/// bullet string
1061fn deserialize_observations<'de, D>(deserializer: D) -> std::result::Result<Vec<String>, D::Error>
1062where
1063   D: serde::Deserializer<'de>,
1064{
1065   use std::fmt;
1066
1067   use serde::de::{self, Visitor};
1068
1069   struct ObservationsVisitor;
1070
1071   impl<'de> Visitor<'de> for ObservationsVisitor {
1072      type Value = Vec<String>;
1073
1074      fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
1075         formatter.write_str("an array of strings, a JSON array string, or a bullet-point string")
1076      }
1077
1078      fn visit_seq<A>(self, mut seq: A) -> std::result::Result<Self::Value, A::Error>
1079      where
1080         A: de::SeqAccess<'de>,
1081      {
1082         let mut vec = Vec::new();
1083         while let Some(item) = seq.next_element::<String>()? {
1084            vec.push(item);
1085         }
1086         Ok(vec)
1087      }
1088
1089      fn visit_str<E>(self, s: &str) -> std::result::Result<Self::Value, E>
1090      where
1091         E: de::Error,
1092      {
1093         Ok(parse_string_to_observations(s))
1094      }
1095   }
1096
1097   deserializer.deserialize_any(ObservationsVisitor)
1098}
1099
1100/// Parse a string into observations: handles JSON array string or bullet-point
1101/// string
1102fn parse_string_to_observations(s: &str) -> Vec<String> {
1103   let trimmed = s.trim();
1104   if trimmed.is_empty() {
1105      return Vec::new();
1106   }
1107
1108   // Try parsing as JSON array first
1109   if trimmed.starts_with('[')
1110      && let Ok(arr) = serde_json::from_str::<Vec<String>>(trimmed)
1111   {
1112      return arr;
1113   }
1114
1115   // Fall back to bullet-point parsing
1116   trimmed
1117      .lines()
1118      .map(str::trim)
1119      .filter(|line| !line.is_empty())
1120      .map(|line| {
1121         line
1122            .strip_prefix("- ")
1123            .or_else(|| line.strip_prefix("* "))
1124            .or_else(|| line.strip_prefix("• "))
1125            .unwrap_or(line)
1126            .trim()
1127            .to_string()
1128      })
1129      .filter(|line| !line.is_empty())
1130      .collect()
1131}
1132
1133fn build_observation_tool() -> Tool {
1134   Tool {
1135      tool_type: "function".to_string(),
1136      function:  Function {
1137         name:        "create_file_observation".to_string(),
1138         description: "Extract observations from a single file's changes".to_string(),
1139         parameters:  FunctionParameters {
1140            param_type: "object".to_string(),
1141            properties: serde_json::json!({
1142               "observations": {
1143                  "type": "array",
1144                  "description": "List of factual observations about what changed in this file",
1145                  "items": {
1146                     "type": "string"
1147                  }
1148               }
1149            }),
1150            required:   vec!["observations".to_string()],
1151         },
1152      },
1153   }
1154}
1155
/// Tool definition (OpenAI function-calling format) for the reduce phase:
/// the model synthesizes per-file observations into a conventional-commit
/// analysis. `type_enum` constrains the allowed commit types; only
/// `type`, `details`, and `issue_refs` are required — `scope` is optional.
fn build_analysis_tool(type_enum: &[&str]) -> Tool {
   Tool {
      tool_type: "function".to_string(),
      function:  Function {
         name:        "create_conventional_analysis".to_string(),
         description: "Synthesize observations into conventional commit analysis".to_string(),
         parameters:  FunctionParameters {
            param_type: "object".to_string(),
            properties: serde_json::json!({
               "type": {
                  "type": "string",
                  "enum": type_enum,
                  "description": "Commit type based on combined changes"
               },
               "scope": {
                  "type": "string",
                  "description": "Optional scope (module/component). Omit if unclear or multi-component."
               },
               "details": {
                  "type": "array",
                  "description": "Array of 0-6 detail items with changelog metadata.",
                  "items": {
                     "type": "object",
                     "properties": {
                        "text": {
                           "type": "string",
                           "description": "Detail about change, starting with past-tense verb, ending with period"
                        },
                        "changelog_category": {
                           "type": "string",
                           "enum": ["Added", "Changed", "Fixed", "Deprecated", "Removed", "Security"],
                           "description": "Changelog category if user-visible. Omit for internal changes."
                        },
                        "user_visible": {
                           "type": "boolean",
                           "description": "True if this change affects users/API and should appear in changelog"
                        }
                     },
                     "required": ["text", "user_visible"]
                  }
               },
               "issue_refs": {
                  "type": "array",
                  "description": "Issue numbers from context (e.g., ['#123', '#456']). Empty if none.",
                  "items": {
                     "type": "string"
                  }
               }
            }),
            required:   vec!["type".to_string(), "details".to_string(), "issue_refs".to_string()],
         },
      },
   }
}
1210
1211#[allow(
1212   clippy::too_many_arguments,
1213   reason = "needs model/tool/prompt metadata to build deterministic cacheable request"
1214)]
1215fn build_api_request(
1216   config: &CommitConfig,
1217   model: &str,
1218   temperature: f32,
1219   tools: Vec<Tool>,
1220   system: &str,
1221   user: &str,
1222   prompt_family: &str,
1223   prompt_variant: &str,
1224) -> ApiRequest {
1225   let tool_name = tools.first().map(|t| t.function.name.clone());
1226
1227   let mut messages = Vec::new();
1228   if !system.is_empty() {
1229      messages.push(Message { role: "system".to_string(), content: system.to_string() });
1230   }
1231   messages.push(Message { role: "user".to_string(), content: user.to_string() });
1232
1233   let prompt_cache_key =
1234      crate::api::openai_prompt_cache_key(config, model, prompt_family, prompt_variant, system);
1235
1236   ApiRequest {
1237      model: model.to_string(),
1238      max_tokens: 1500,
1239      temperature,
1240      tool_choice: tool_name
1241         .map(|name| serde_json::json!({ "type": "function", "function": { "name": name } })),
1242      prompt_cache_key,
1243      tools,
1244      messages,
1245   }
1246}
1247
#[cfg(test)]
mod tests {
   use super::*;
   use crate::tokens::TokenCounter;

   /// Shared counter for the threshold tests.
   fn test_counter() -> TokenCounter {
      TokenCounter::new("http://localhost:4000", None, "claude-sonnet-4.5")
   }

   #[test]
   fn test_should_use_map_reduce_disabled() {
      let config = CommitConfig { map_reduce_enabled: false, ..Default::default() };
      let counter = test_counter();
      // Even with many files, disabled means no map-reduce
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b
diff --git a/c.rs b/c.rs
@@ -0,0 +1 @@
+c
diff --git a/d.rs b/d.rs
@@ -0,0 +1 @@
+d";
      assert!(!should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_should_use_map_reduce_few_files() {
      let config = CommitConfig::default();
      let counter = test_counter();
      // Only 2 files - below threshold
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b";
      assert!(!should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_should_use_map_reduce_many_files() {
      let config = CommitConfig::default();
      let counter = test_counter();
      // 5 files - above threshold
      // (fixture fix: the d.rs header previously read `d/d.rs` instead of
      // the valid git-diff form `b/d.rs`)
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b
diff --git a/c.rs b/c.rs
@@ -0,0 +1 @@
+c
diff --git a/d.rs b/d.rs
@@ -0,0 +1 @@
+d
diff --git a/e.rs b/e.rs
@@ -0,0 +1 @@
+e";
      assert!(should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_generate_context_header_empty() {
      // A single file has no "other files", so the header is empty.
      let files = vec![FileDiff {
         filename:  "only.rs".to_string(),
         header:    String::new(),
         content:   String::new(),
         additions: 10,
         deletions: 5,
         is_binary: false,
      }];
      let header = generate_context_header(&files, "only.rs");
      assert!(header.is_empty());
   }

   #[test]
   fn test_generate_context_header_multiple() {
      let files = vec![
         FileDiff {
            filename:  "src/main.rs".to_string(),
            header:    String::new(),
            content:   "fn main() {}".to_string(),
            additions: 10,
            deletions: 5,
            is_binary: false,
         },
         FileDiff {
            filename:  "src/lib.rs".to_string(),
            header:    String::new(),
            content:   "mod test;".to_string(),
            additions: 3,
            deletions: 1,
            is_binary: false,
         },
         FileDiff {
            filename:  "tests/test.rs".to_string(),
            header:    String::new(),
            content:   "#[test]".to_string(),
            additions: 20,
            deletions: 0,
            is_binary: false,
         },
      ];

      let header = generate_context_header(&files, "src/main.rs");
      assert!(header.contains("OTHER FILES IN THIS CHANGE:"));
      assert!(header.contains("src/lib.rs"));
      assert!(header.contains("tests/test.rs"));
      assert!(!header.contains("src/main.rs")); // Current file excluded
   }

   #[test]
   fn test_infer_file_description() {
      assert_eq!(infer_file_description("src/test_utils.rs", ""), "test file");
      assert_eq!(infer_file_description("README.md", ""), "documentation");
      assert_eq!(infer_file_description("config.toml", ""), "configuration");
      assert_eq!(infer_file_description("src/error.rs", ""), "error definitions");
      assert_eq!(infer_file_description("src/types.rs", ""), "type definitions");
      assert_eq!(infer_file_description("src/mod.rs", ""), "module exports");
      assert_eq!(infer_file_description("src/main.rs", ""), "entry point");
      assert_eq!(infer_file_description("src/api.rs", "fn call()"), "implementation");
      assert_eq!(infer_file_description("src/models.rs", "struct Foo"), "type definitions");
      assert_eq!(infer_file_description("src/unknown.xyz", ""), "source code");
   }

   #[test]
   fn test_parse_string_to_observations_json_array() {
      let input = r#"["item one", "item two", "item three"]"#;
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["item one", "item two", "item three"]);
   }

   #[test]
   fn test_parse_string_to_observations_bullet_points() {
      let input = "- added new function\n- fixed bug in parser\n- updated tests";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["added new function", "fixed bug in parser", "updated tests"]);
   }

   #[test]
   fn test_parse_string_to_observations_asterisk_bullets() {
      let input = "* first change\n* second change";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["first change", "second change"]);
   }

   #[test]
   fn test_parse_string_to_observations_unicode_bullets() {
      // The parser also strips the `• ` marker.
      let input = "• first change\n• second change";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["first change", "second change"]);
   }

   #[test]
   fn test_parse_string_to_observations_empty() {
      assert!(parse_string_to_observations("").is_empty());
      assert!(parse_string_to_observations("   ").is_empty());
   }

   #[test]
   fn test_deserialize_observations_array() {
      let json = r#"{"observations": ["a", "b", "c"]}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["a", "b", "c"]);
   }

   #[test]
   fn test_deserialize_observations_stringified_array() {
      let json = r#"{"observations": "[\"a\", \"b\", \"c\"]"}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["a", "b", "c"]);
   }

   #[test]
   fn test_deserialize_observations_bullet_string() {
      let json = r#"{"observations": "- updated function\n- fixed bug"}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["updated function", "fixed bug"]);
   }
}