Skip to main content

llm_git/
map_reduce.rs

1//! Map-reduce pattern for large diff analysis
2//!
3//! When diffs exceed the token threshold, this module splits analysis across
4//! files, then synthesizes results for accurate classification.
5
6use std::path::Path;
7
8use futures::stream::{self, StreamExt};
9use serde::{Deserialize, Serialize};
10
11use crate::{
12   api::retry_api_call,
13   config::{CommitConfig, ResolvedApiMode},
14   diff::{FileDiff, parse_diff, reconstruct_diff},
15   error::{CommitGenError, Result},
16   templates,
17   tokens::TokenCounter,
18   types::ConventionalAnalysis,
19};
20
/// Observation from a single file during map phase
///
/// Produced once per file by the map phase and serialized to JSON when the
/// collected observations are embedded in the reduce prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileObservation {
   /// Name of the file the observations refer to.
   pub file:         String,
   /// Statements describing what changed in the file.
   pub observations: Vec<String>,
   /// Count of added lines (0 for binary files in the map phase).
   pub additions:    usize,
   /// Count of deleted lines (0 for binary files in the map phase).
   pub deletions:    usize,
}
29
/// Minimum files to justify map-reduce overhead (below this, unified is fine)
///
/// Compared against the number of non-excluded files in the diff.
const MIN_FILES_FOR_MAP_REDUCE: usize = 4;

/// Maximum tokens per file in map phase (leave headroom for prompt template +
/// context)
///
/// Files whose token estimate exceeds this are truncated before being sent
/// to the model.
const MAX_FILE_TOKENS: usize = 50_000;
36
37/// Check if map-reduce should be used
38/// Always use map-reduce except for:
39/// 1. Explicitly disabled in config
40/// 2. Very small diffs (≤3 files) where overhead isn't worth it
41pub fn should_use_map_reduce(diff: &str, config: &CommitConfig, counter: &TokenCounter) -> bool {
42   if !config.map_reduce_enabled {
43      return false;
44   }
45
46   let files = parse_diff(diff);
47   let file_count = files
48      .iter()
49      .filter(|f| {
50         !config
51            .excluded_files
52            .iter()
53            .any(|ex| f.filename.ends_with(ex))
54      })
55      .count();
56
57   // Use map-reduce for 4+ files, or if any single file would need truncation
58   file_count >= MIN_FILES_FOR_MAP_REDUCE
59      || files
60         .iter()
61         .any(|f| f.token_estimate(counter) > MAX_FILE_TOKENS)
62}
63
/// Maximum files to include in context header (prevent token explosion)
///
/// When more sibling files exist, only the ones with the most changed lines
/// are listed and the remainder is summarized as a count.
const MAX_CONTEXT_FILES: usize = 20;
66
67/// Generate context header summarizing other files for cross-file awareness
68fn generate_context_header(files: &[FileDiff], current_file: &str) -> String {
69   // Skip context header for very large commits (diminishing returns)
70   if files.len() > 100 {
71      return format!("(Large commit with {} total files)", files.len());
72   }
73
74   let mut lines = vec!["OTHER FILES IN THIS CHANGE:".to_string()];
75
76   let other_files: Vec<_> = files
77      .iter()
78      .filter(|f| f.filename != current_file)
79      .collect();
80
81   let total_other = other_files.len();
82
83   // Only show top files by change size if too many
84   let to_show: Vec<&FileDiff> = if total_other > MAX_CONTEXT_FILES {
85      let mut sorted = other_files;
86      sorted.sort_by_key(|f| std::cmp::Reverse(f.additions + f.deletions));
87      sorted.truncate(MAX_CONTEXT_FILES);
88      sorted
89   } else {
90      other_files
91   };
92
93   for file in &to_show {
94      let line_count = file.additions + file.deletions;
95      let description = infer_file_description(&file.filename, &file.content);
96      lines.push(format!("- {} ({} lines): {}", file.filename, line_count, description));
97   }
98
99   if to_show.len() < total_other {
100      lines.push(format!("... and {} more files", total_other - to_show.len()));
101   }
102
103   if lines.len() == 1 {
104      return String::new(); // No other files
105   }
106
107   lines.join("\n")
108}
109
/// Guess a short label for a file from its name and, failing that, its
/// content.
///
/// Filename rules are tried first, in order (case-insensitive): "test"
/// anywhere, `.md` extension, "config" or a TOML/YAML extension, "error",
/// "type", `mod.rs`/`lib.rs` suffix, `main.{rs,go,py}` suffix. If none match,
/// the content is scanned for a few keyword pairs; the final fallback is
/// `"source code"`.
fn infer_file_description(filename: &str, content: &str) -> &'static str {
   // (marker, alternative marker, label) — first matching row wins
   const CONTENT_RULES: [(&str, &str, &str); 3] = [
      ("impl ", "fn ", "implementation"),
      ("struct ", "enum ", "type definitions"),
      ("async ", "await", "async code"),
   ];

   let lowered = filename.to_lowercase();
   let extension = Path::new(filename).extension();

   let has_ext = |wanted: &str| extension.is_some_and(|ext| ext.eq_ignore_ascii_case(wanted));
   let name_has = |needle: &str| lowered.contains(needle);
   let name_ends = |suffix: &str| lowered.ends_with(suffix);

   if name_has("test") {
      "test file"
   } else if has_ext("md") {
      "documentation"
   } else if name_has("config") || has_ext("toml") || has_ext("yaml") || has_ext("yml") {
      "configuration"
   } else if name_has("error") {
      "error definitions"
   } else if name_has("type") {
      "type definitions"
   } else if name_ends("mod.rs") || name_ends("lib.rs") {
      "module exports"
   } else if name_ends("main.rs") || name_ends("main.go") || name_ends("main.py") {
      "entry point"
   } else {
      // No filename rule applied: fall back to content markers
      CONTENT_RULES
         .iter()
         .find(|rule| content.contains(rule.0) || content.contains(rule.1))
         .map_or("source code", |rule| rule.2)
   }
}
162
163/// Map phase: analyze each file individually and extract observations
164async fn map_phase(
165   files: &[FileDiff],
166   model_name: &str,
167   config: &CommitConfig,
168   counter: &TokenCounter,
169) -> Result<Vec<FileObservation>> {
170   // Process files concurrently using futures stream
171   let observations: Vec<Result<FileObservation>> = stream::iter(files.iter())
172      .map(|file| async {
173         if file.is_binary {
174            return Ok(FileObservation {
175               file:         file.filename.clone(),
176               observations: vec!["Binary file changed.".to_string()],
177               additions:    0,
178               deletions:    0,
179            });
180         }
181
182         let context_header = generate_context_header(files, &file.filename);
183         // Truncate large files to fit API limits
184         let mut file_clone = file.clone();
185         let file_tokens = file_clone.token_estimate(counter);
186         if file_tokens > MAX_FILE_TOKENS {
187            let target_size = MAX_FILE_TOKENS * 4; // Convert tokens to chars
188            file_clone.truncate(target_size);
189            eprintln!(
190               "  {} truncated {} ({} \u{2192} {} tokens)",
191               crate::style::icons::WARNING,
192               file.filename,
193               file_tokens,
194               file_clone.token_estimate(counter)
195            );
196         }
197
198         let file_diff = reconstruct_diff(&[file_clone]);
199
200         map_single_file(&file.filename, &file_diff, &context_header, model_name, config).await
201      })
202      .buffer_unordered(8)
203      .collect()
204      .await;
205
206   // Collect results, failing fast on first error
207   observations.into_iter().collect()
208}
209
/// Analyze a single file and extract observations
///
/// Renders the "default" map prompt for `filename` from `file_diff` and
/// `context_header`, sends it to the endpoint selected by
/// `config.resolved_api_mode(model_name)` and converts the reply into a
/// [`FileObservation`]. The whole exchange runs inside `retry_api_call`; the
/// closure returns `(true, None)` to request another attempt and
/// `(false, Some(..))` once it has a result.
///
/// The returned observation always has `additions` and `deletions` set to 0;
/// this function does not receive line counts.
///
/// # Errors
/// * `CommitGenError::ApiError` for a non-success status that is not a
///   server error (server errors request a retry instead).
/// * `CommitGenError::Other` when the response cannot be parsed or contains
///   no observation.
/// * Errors from prompt rendering, `timed_send` and
///   `extract_anthropic_content` are propagated with `?`.
async fn map_single_file(
   filename: &str,
   file_diff: &str,
   context_header: &str,
   model_name: &str,
   config: &CommitConfig,
) -> Result<FileObservation> {
   retry_api_call(config, async move || {
      let client = crate::api::get_client(config);

      // Only the ChatCompletions branch consumes this tool definition; the
      // Anthropic branch spells out its own schema inline.
      let tool = build_observation_tool();

      let parts = templates::render_map_prompt("default", filename, file_diff, context_header)?;
      let mode = config.resolved_api_mode(model_name);

      // Step 1: send the request and obtain the raw response body.
      let response_text = match mode {
         ResolvedApiMode::ChatCompletions => {
            let request = build_api_request(
               model_name,
               config.temperature,
               vec![tool],
               &parts.system,
               &parts.user,
            );

            let mut request_builder = client
               .post(format!("{}/chat/completions", config.api_base_url))
               .header("content-type", "application/json");

            // The API key is optional; without it no Authorization header is sent
            if let Some(api_key) = &config.api_key {
               request_builder =
                  request_builder.header("Authorization", format!("Bearer {api_key}"));
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/map", model_name).await?;

            // 5xx: log and ask for another attempt
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            // Any other non-success status is returned as an error (no retry requested)
            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
         ResolvedApiMode::AnthropicMessages => {
            let request = AnthropicRequest {
               model:       model_name.to_string(),
               max_tokens:  1500,
               temperature: config.temperature,
               // An empty system prompt is omitted rather than sent as ""
               system:      if parts.system.is_empty() {
                  None
               } else {
                  Some(parts.system.clone())
               },
               tools:       vec![AnthropicTool {
                  name:         "create_file_observation".to_string(),
                  description:  "Extract observations from a single file's changes".to_string(),
                  input_schema: serde_json::json!({
                     "type": "object",
                     "properties": {
                        "observations": {
                           "type": "array",
                           "description": "List of factual observations about what changed in this file",
                           "items": {"type": "string"}
                        }
                     },
                     "required": ["observations"]
                  }),
               }],
               // Force the model to answer through the observation tool
               tool_choice: Some(AnthropicToolChoice {
                  choice_type: "tool".to_string(),
                  name:        "create_file_observation".to_string(),
               }),
               messages:    vec![AnthropicMessage {
                  role:    "user".to_string(),
                  content: vec![AnthropicContent {
                     content_type: "text".to_string(),
                     text:         parts.user,
                  }],
               }],
            };

            let mut request_builder = client
               .post(anthropic_messages_url(&config.api_base_url))
               .header("content-type", "application/json")
               .header("anthropic-version", "2023-06-01");

            if let Some(api_key) = &config.api_key {
               request_builder = request_builder.header("x-api-key", api_key);
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/map", model_name).await?;

            // Same status handling as the ChatCompletions branch
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
      };

      // A successful status with a blank body is treated as transient
      if response_text.trim().is_empty() {
         crate::style::warn("Model returned empty response body for observation; retrying.");
         return Ok((true, None));
      }

      // Step 2: decode the body according to the API flavour.
      match mode {
         ResolvedApiMode::ChatCompletions => {
            let api_response: ApiResponse = serde_json::from_str(&response_text).map_err(|e| {
               CommitGenError::Other(format!(
                  "Failed to parse observation response JSON: {e}. Response body: {}",
                  response_snippet(&response_text, 500)
               ))
            })?;

            if api_response.choices.is_empty() {
               return Err(CommitGenError::Other(
                  "API returned empty response for file observation".to_string(),
               ));
            }

            // Only the first choice is inspected
            let message = &api_response.choices[0].message;

            // Preferred path: the first tool call carries the observations.
            // The name is matched by suffix — presumably to tolerate prefixed
            // tool names; TODO confirm.
            if !message.tool_calls.is_empty() {
               let tool_call = &message.tool_calls[0];
               if tool_call.function.name.ends_with("create_file_observation") {
                  let args = &tool_call.function.arguments;
                  if args.is_empty() {
                     return Err(CommitGenError::Other(
                        "Model returned empty function arguments for observation".to_string(),
                     ));
                  }

                  let obs: FileObservationResponse = serde_json::from_str(args).map_err(|e| {
                     CommitGenError::Other(format!("Failed to parse observation response: {e}"))
                  })?;

                  return Ok((
                     false,
                     Some(FileObservation {
                        file:         filename.to_string(),
                        observations: obs.observations,
                        additions:    0, // Will be filled from FileDiff
                        deletions:    0,
                     }),
                  ));
               }
            }

            // Fallback: try to parse content
            if let Some(content) = &message.content {
               if content.trim().is_empty() {
                  crate::style::warn("Model returned empty content for observation; retrying.");
                  return Ok((true, None));
               }
               let obs: FileObservationResponse =
                  serde_json::from_str(content.trim()).map_err(|e| {
                     CommitGenError::Other(format!(
                        "Failed to parse observation content JSON: {e}. Content: {}",
                        response_snippet(content, 500)
                     ))
                  })?;
               return Ok((
                  false,
                  Some(FileObservation {
                     file:         filename.to_string(),
                     observations: obs.observations,
                     additions:    0,
                     deletions:    0,
                  }),
               ));
            }

            // Neither a matching tool call nor any message content
            Err(CommitGenError::Other("No observation found in API response".to_string()))
         },
         ResolvedApiMode::AnthropicMessages => {
            let (tool_input, text_content, stop_reason) =
               extract_anthropic_content(&response_text, "create_file_observation")?;

            if let Some(input) = tool_input {
               // Accept an array of strings (non-string items are dropped) or
               // a single string that is split by parse_string_to_observations
               let mut observations = match input.get("observations") {
                  Some(serde_json::Value::Array(arr)) => arr
                     .iter()
                     .filter_map(|v| v.as_str().map(str::to_string))
                     .collect::<Vec<_>>(),
                  Some(serde_json::Value::String(s)) => parse_string_to_observations(s),
                  _ => Vec::new(),
               };

               // Empty tool input: fall back to the text blocks, then to a
               // generic observation if generation stopped at max_tokens;
               // otherwise continue with no observations (not an error).
               if observations.is_empty() {
                  let text_observations = parse_observations_from_text(&text_content);
                  if !text_observations.is_empty() {
                     observations = text_observations;
                  } else if stop_reason.as_deref() == Some("max_tokens") {
                     crate::style::warn(
                        "Anthropic stopped at max_tokens with empty observations; using fallback \
                         observation.",
                     );
                     // Use just the file name (last path component) when available
                     let fallback_target = Path::new(filename)
                        .file_name()
                        .and_then(|name| name.to_str())
                        .unwrap_or(filename);
                     observations = vec![format!("Updated {fallback_target}.")];
                  } else {
                     crate::style::warn(
                        "Model returned empty observation tool input; continuing with no \
                         observations.",
                     );
                  }
               }

               return Ok((
                  false,
                  Some(FileObservation {
                     file: filename.to_string(),
                     observations,
                     additions: 0,
                     deletions: 0,
                  }),
               ));
            }

            // No tool input at all: the text content must be JSON, or we retry if blank
            if text_content.trim().is_empty() {
               crate::style::warn("Model returned empty content for observation; retrying.");
               return Ok((true, None));
            }

            let obs: FileObservationResponse =
               serde_json::from_str(text_content.trim()).map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse observation content JSON: {e}. Content: {}",
                     response_snippet(&text_content, 500)
                  ))
               })?;
            Ok((
               false,
               Some(FileObservation {
                  file:         filename.to_string(),
                  observations: obs.observations,
                  additions:    0,
                  deletions:    0,
               }),
            ))
         },
      }
   }).await
}
480
/// Reduce phase: synthesize all observations into final analysis
///
/// Serializes `observations` to pretty JSON, renders the "default" reduce
/// prompt together with `stat`, `scope_candidates` and the configured commit
/// type descriptions, and asks the model (endpoint chosen by
/// `config.resolved_api_mode(model_name)`) to answer through the
/// `create_conventional_analysis` tool. The exchange runs inside
/// `retry_api_call`; the closure returns `(true, None)` to request another
/// attempt and `(false, Some(analysis))` on success.
///
/// # Errors
/// * `CommitGenError::ApiError` for a non-success status that is not a
///   server error (server errors request a retry instead).
/// * `CommitGenError::Other` when the response cannot be parsed or contains
///   no analysis.
/// * Errors from prompt rendering, `timed_send` and
///   `extract_anthropic_content` are propagated with `?`.
pub async fn reduce_phase(
   observations: &[FileObservation],
   stat: &str,
   scope_candidates: &str,
   model_name: &str,
   config: &CommitConfig,
) -> Result<ConventionalAnalysis> {
   retry_api_call(config, async move || {
      let client = crate::api::get_client(config);

      // Build type enum from config
      let type_enum: Vec<&str> = config.types.keys().map(|s| s.as_str()).collect();

      // Only the ChatCompletions branch consumes this tool definition; the
      // Anthropic branch spells out its own schema inline.
      let tool = build_analysis_tool(&type_enum);

      // A serialization failure degrades to an empty JSON array instead of
      // aborting the reduce step
      let observations_json =
         serde_json::to_string_pretty(observations).unwrap_or_else(|_| "[]".to_string());

      let types_description = crate::api::format_types_description(config);
      let parts = templates::render_reduce_prompt(
         "default",
         &observations_json,
         stat,
         scope_candidates,
         Some(&types_description),
      )?;
      let mode = config.resolved_api_mode(model_name);

      // Step 1: send the request and obtain the raw response body.
      let response_text = match mode {
         ResolvedApiMode::ChatCompletions => {
            let request = build_api_request(
               model_name,
               config.temperature,
               vec![tool],
               &parts.system,
               &parts.user,
            );

            let mut request_builder = client
               .post(format!("{}/chat/completions", config.api_base_url))
               .header("content-type", "application/json");

            // The API key is optional; without it no Authorization header is sent
            if let Some(api_key) = &config.api_key {
               request_builder =
                  request_builder.header("Authorization", format!("Bearer {api_key}"));
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/reduce", model_name).await?;

            // 5xx: log and ask for another attempt
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None)); // Retry
            }

            // Any other non-success status is returned as an error (no retry requested)
            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
         ResolvedApiMode::AnthropicMessages => {
            let request = AnthropicRequest {
               model:       model_name.to_string(),
               max_tokens:  1500,
               temperature: config.temperature,
               // An empty system prompt is omitted rather than sent as ""
               system:      if parts.system.is_empty() {
                  None
               } else {
                  Some(parts.system.clone())
               },
               tools:       vec![AnthropicTool {
                  name:         "create_conventional_analysis".to_string(),
                  description:  "Analyze changes and classify as conventional commit with type, \
                                 scope, details, and metadata"
                     .to_string(),
                  input_schema: serde_json::json!({
                     "type": "object",
                     "properties": {
                        "type": {
                           "type": "string",
                           "enum": type_enum,
                           "description": "Commit type based on change classification"
                        },
                        "scope": {
                           "type": "string",
                           "description": "Optional scope (module/component). Omit if unclear or multi-component."
                        },
                        "details": {
                           "type": "array",
                           "description": "Array of 0-6 detail items with changelog metadata.",
                           "items": {
                              "type": "object",
                              "properties": {
                                 "text": {
                                    "type": "string",
                                    "description": "Detail about change, starting with past-tense verb, ending with period"
                                 },
                                 "changelog_category": {
                                    "type": "string",
                                    "enum": ["Added", "Changed", "Fixed", "Deprecated", "Removed", "Security"],
                                    "description": "Changelog category if user-visible. Omit for internal changes."
                                 },
                                 "user_visible": {
                                    "type": "boolean",
                                    "description": "True if this change affects users/API and should appear in changelog"
                                 }
                              },
                              "required": ["text", "user_visible"]
                           }
                        },
                        "issue_refs": {
                           "type": "array",
                           "description": "Issue numbers from context (e.g., ['#123', '#456']). Empty if none.",
                           "items": {
                              "type": "string"
                           }
                        }
                     },
                     "required": ["type", "details", "issue_refs"]
                  }),
               }],
               // Force the model to answer through the analysis tool
               tool_choice: Some(AnthropicToolChoice {
                  choice_type: "tool".to_string(),
                  name:        "create_conventional_analysis".to_string(),
               }),
               messages:    vec![AnthropicMessage {
                  role:    "user".to_string(),
                  content: vec![AnthropicContent {
                     content_type: "text".to_string(),
                     text:         parts.user,
                  }],
               }],
            };

            let mut request_builder = client
               .post(anthropic_messages_url(&config.api_base_url))
               .header("content-type", "application/json")
               .header("anthropic-version", "2023-06-01");

            if let Some(api_key) = &config.api_key {
               request_builder = request_builder.header("x-api-key", api_key);
            }

            let (status, response_text) =
               crate::api::timed_send(request_builder.json(&request), "map-reduce/reduce", model_name).await?;

            // Same status handling as the ChatCompletions branch
            if status.is_server_error() {
               eprintln!(
                  "{}",
                  crate::style::error(&format!("Server error {status}: {response_text}"))
               );
               return Ok((true, None));
            }

            if !status.is_success() {
               return Err(CommitGenError::ApiError {
                  status: status.as_u16(),
                  body:   response_text,
               });
            }

            response_text
         },
      };

      // A successful status with a blank body is treated as transient
      if response_text.trim().is_empty() {
         crate::style::warn("Model returned empty response body for synthesis; retrying.");
         return Ok((true, None));
      }

      // Step 2: decode the body according to the API flavour.
      match mode {
         ResolvedApiMode::ChatCompletions => {
            let api_response: ApiResponse = serde_json::from_str(&response_text).map_err(|e| {
               CommitGenError::Other(format!(
                  "Failed to parse synthesis response JSON: {e}. Response body: {}",
                  response_snippet(&response_text, 500)
               ))
            })?;

            if api_response.choices.is_empty() {
               return Err(CommitGenError::Other(
                  "API returned empty response for synthesis".to_string(),
               ));
            }

            // Only the first choice is inspected
            let message = &api_response.choices[0].message;

            // Preferred path: the first tool call carries the analysis.
            // The name is matched by suffix — presumably to tolerate prefixed
            // tool names; TODO confirm.
            if !message.tool_calls.is_empty() {
               let tool_call = &message.tool_calls[0];
               if tool_call
                  .function
                  .name
                  .ends_with("create_conventional_analysis")
               {
                  let args = &tool_call.function.arguments;
                  if args.is_empty() {
                     return Err(CommitGenError::Other(
                        "Model returned empty function arguments for synthesis".to_string(),
                     ));
                  }

                  let analysis: ConventionalAnalysis = serde_json::from_str(args).map_err(|e| {
                     CommitGenError::Other(format!("Failed to parse synthesis response: {e}"))
                  })?;

                  return Ok((false, Some(analysis)));
               }
            }

            // Fallback: treat the message content as the analysis JSON
            if let Some(content) = &message.content {
               if content.trim().is_empty() {
                  crate::style::warn("Model returned empty content for synthesis; retrying.");
                  return Ok((true, None));
               }
               let analysis: ConventionalAnalysis =
                  serde_json::from_str(content.trim()).map_err(|e| {
                     CommitGenError::Other(format!(
                        "Failed to parse synthesis content JSON: {e}. Content: {}",
                        response_snippet(content, 500)
                     ))
                  })?;
               return Ok((false, Some(analysis)));
            }

            // Neither a matching tool call nor any message content
            Err(CommitGenError::Other("No analysis found in synthesis response".to_string()))
         },
         ResolvedApiMode::AnthropicMessages => {
            let (tool_input, text_content, stop_reason) =
               extract_anthropic_content(&response_text, "create_conventional_analysis")?;

            // Preferred path: deserialize the tool input directly
            if let Some(input) = tool_input {
               let analysis: ConventionalAnalysis = serde_json::from_value(input).map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse synthesis tool input: {e}. Response body: {}",
                     response_snippet(&response_text, 500)
                  ))
               })?;
               return Ok((false, Some(analysis)));
            }

            // No tool input and no text: retry in both cases; only the
            // warning differs when generation stopped at max_tokens
            if text_content.trim().is_empty() {
               if stop_reason.as_deref() == Some("max_tokens") {
                  crate::style::warn(
                     "Anthropic stopped at max_tokens with empty synthesis; retrying.",
                  );
                  return Ok((true, None));
               }
               crate::style::warn("Model returned empty content for synthesis; retrying.");
               return Ok((true, None));
            }

            // Fallback: treat the text content as the analysis JSON
            let analysis: ConventionalAnalysis = serde_json::from_str(text_content.trim())
               .map_err(|e| {
                  CommitGenError::Other(format!(
                     "Failed to parse synthesis content JSON: {e}. Content: {}",
                     response_snippet(&text_content, 500)
                  ))
               })?;
            Ok((false, Some(analysis)))
         },
      }
   }).await
}
753
754/// Run full map-reduce pipeline for large diffs
755pub async fn run_map_reduce(
756   diff: &str,
757   stat: &str,
758   scope_candidates: &str,
759   model_name: &str,
760   config: &CommitConfig,
761   counter: &TokenCounter,
762) -> Result<ConventionalAnalysis> {
763   let mut files = parse_diff(diff);
764
765   // Filter excluded files
766   files.retain(|f| {
767      !config
768         .excluded_files
769         .iter()
770         .any(|excluded| f.filename.ends_with(excluded))
771   });
772
773   if files.is_empty() {
774      return Err(CommitGenError::Other(
775         "No relevant files to analyze after filtering".to_string(),
776      ));
777   }
778
779   let file_count = files.len();
780   crate::style::print_info(&format!("Running map-reduce on {file_count} files..."));
781
782   // Map phase
783   let observations = map_phase(&files, model_name, config, counter).await?;
784
785   // Reduce phase
786   reduce_phase(&observations, stat, scope_candidates, model_name, config).await
787}
788
789// ============================================================================
790// API types (duplicated from api.rs to avoid circular deps)
791// ============================================================================
792
/// Produce a short, printable excerpt of a response body for error messages.
///
/// The body is trimmed first. A body that is empty (or whitespace only)
/// yields the placeholder `"<empty response body>"`. Bodies longer than
/// `limit` bytes are cut and suffixed with `"..."`; the cut is moved back to
/// the nearest UTF-8 character boundary so multi-byte text never causes a
/// panic (`String::truncate` panics when the index is inside a character).
fn response_snippet(body: &str, limit: usize) -> String {
   let trimmed = body.trim();
   if trimmed.is_empty() {
      return "<empty response body>".to_string();
   }
   if trimmed.len() <= limit {
      return trimmed.to_string();
   }

   // Back off to a valid boundary; index 0 is always one, so this terminates
   let mut cut = limit;
   while !trimmed.is_char_boundary(cut) {
      cut -= 1;
   }

   format!("{}...", &trimmed[..cut])
}
804
805fn parse_observations_from_text(text: &str) -> Vec<String> {
806   let trimmed = text.trim();
807   if trimmed.is_empty() {
808      return Vec::new();
809   }
810
811   if let Ok(obs) = serde_json::from_str::<FileObservationResponse>(trimmed) {
812      return obs.observations;
813   }
814
815   trimmed
816      .lines()
817      .map(str::trim)
818      .filter(|line| !line.is_empty())
819      .map(|line| {
820         line
821            .strip_prefix("- ")
822            .or_else(|| line.strip_prefix("* "))
823            .unwrap_or(line)
824            .trim()
825      })
826      .filter(|line| !line.is_empty())
827      .map(str::to_string)
828      .collect()
829}
830
/// Build the Anthropic `messages` endpoint URL from a base URL.
///
/// Trailing slashes on `base_url` are ignored. If the base already ends in
/// `/v1`, only `/messages` is appended; otherwise `/v1/messages` is appended.
fn anthropic_messages_url(base_url: &str) -> String {
   let base = base_url.trim_end_matches('/');
   let path = if base.ends_with("/v1") {
      "/messages"
   } else {
      "/v1/messages"
   };
   format!("{base}{path}")
}
839
840fn extract_anthropic_content(
841   response_text: &str,
842   tool_name: &str,
843) -> Result<(Option<serde_json::Value>, String, Option<String>)> {
844   let value: serde_json::Value = serde_json::from_str(response_text).map_err(|e| {
845      CommitGenError::Other(format!(
846         "Failed to parse Anthropic response JSON: {e}. Response body: {}",
847         response_snippet(response_text, 500)
848      ))
849   })?;
850
851   let stop_reason = value
852      .get("stop_reason")
853      .and_then(|v| v.as_str())
854      .map(str::to_string);
855
856   let mut tool_input: Option<serde_json::Value> = None;
857   let mut text_parts = Vec::new();
858
859   if let Some(content) = value.get("content").and_then(|v| v.as_array()) {
860      for item in content {
861         let item_type = item.get("type").and_then(|v| v.as_str()).unwrap_or("");
862         match item_type {
863            "tool_use" => {
864               let name = item.get("name").and_then(|v| v.as_str()).unwrap_or("");
865               if name == tool_name
866                  && let Some(input) = item.get("input")
867               {
868                  tool_input = Some(input.clone());
869               }
870            },
871            "text" => {
872               if let Some(text) = item.get("text").and_then(|v| v.as_str()) {
873                  text_parts.push(text.to_string());
874               }
875            },
876            _ => {},
877         }
878      }
879   }
880
881   Ok((tool_input, text_parts.join("\n"), stop_reason))
882}
883
884#[derive(Debug, Serialize)]
885struct Message {
886   role:    String,
887   content: String,
888}
889
890#[derive(Debug, Serialize, Deserialize)]
891struct FunctionParameters {
892   #[serde(rename = "type")]
893   param_type: String,
894   properties: serde_json::Value,
895   required:   Vec<String>,
896}
897
898#[derive(Debug, Serialize, Deserialize)]
899struct Function {
900   name:        String,
901   description: String,
902   parameters:  FunctionParameters,
903}
904
905#[derive(Debug, Serialize, Deserialize)]
906struct Tool {
907   #[serde(rename = "type")]
908   tool_type: String,
909   function:  Function,
910}
911
912#[derive(Debug, Serialize)]
913struct ApiRequest {
914   model:       String,
915   max_tokens:  u32,
916   temperature: f32,
917   tools:       Vec<Tool>,
918   #[serde(skip_serializing_if = "Option::is_none")]
919   tool_choice: Option<serde_json::Value>,
920   messages:    Vec<Message>,
921}
922
923#[derive(Debug, Serialize)]
924struct AnthropicRequest {
925   model:       String,
926   max_tokens:  u32,
927   temperature: f32,
928   #[serde(skip_serializing_if = "Option::is_none")]
929   system:      Option<String>,
930   tools:       Vec<AnthropicTool>,
931   #[serde(skip_serializing_if = "Option::is_none")]
932   tool_choice: Option<AnthropicToolChoice>,
933   messages:    Vec<AnthropicMessage>,
934}
935
936#[derive(Debug, Serialize)]
937struct AnthropicTool {
938   name:         String,
939   description:  String,
940   input_schema: serde_json::Value,
941}
942
943#[derive(Debug, Serialize)]
944struct AnthropicToolChoice {
945   #[serde(rename = "type")]
946   choice_type: String,
947   name:        String,
948}
949
950#[derive(Debug, Serialize)]
951struct AnthropicMessage {
952   role:    String,
953   content: Vec<AnthropicContent>,
954}
955
956#[derive(Debug, Serialize)]
957struct AnthropicContent {
958   #[serde(rename = "type")]
959   content_type: String,
960   text:         String,
961}
962
963#[derive(Debug, Deserialize)]
964struct ToolCall {
965   function: FunctionCall,
966}
967
968#[derive(Debug, Deserialize)]
969struct FunctionCall {
970   name:      String,
971   arguments: String,
972}
973
974#[derive(Debug, Deserialize)]
975struct Choice {
976   message: ResponseMessage,
977}
978
979#[derive(Debug, Deserialize)]
980struct ResponseMessage {
981   #[serde(default)]
982   tool_calls: Vec<ToolCall>,
983   #[serde(default)]
984   content:    Option<String>,
985}
986
987#[derive(Debug, Deserialize)]
988struct ApiResponse {
989   choices: Vec<Choice>,
990}
991
992#[derive(Debug, Deserialize)]
993struct FileObservationResponse {
994   #[serde(deserialize_with = "deserialize_observations")]
995   observations: Vec<String>,
996}
997
998/// Deserialize observations flexibly: handles array, stringified array, or
999/// bullet string
1000fn deserialize_observations<'de, D>(deserializer: D) -> std::result::Result<Vec<String>, D::Error>
1001where
1002   D: serde::Deserializer<'de>,
1003{
1004   use std::fmt;
1005
1006   use serde::de::{self, Visitor};
1007
1008   struct ObservationsVisitor;
1009
1010   impl<'de> Visitor<'de> for ObservationsVisitor {
1011      type Value = Vec<String>;
1012
1013      fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
1014         formatter.write_str("an array of strings, a JSON array string, or a bullet-point string")
1015      }
1016
1017      fn visit_seq<A>(self, mut seq: A) -> std::result::Result<Self::Value, A::Error>
1018      where
1019         A: de::SeqAccess<'de>,
1020      {
1021         let mut vec = Vec::new();
1022         while let Some(item) = seq.next_element::<String>()? {
1023            vec.push(item);
1024         }
1025         Ok(vec)
1026      }
1027
1028      fn visit_str<E>(self, s: &str) -> std::result::Result<Self::Value, E>
1029      where
1030         E: de::Error,
1031      {
1032         Ok(parse_string_to_observations(s))
1033      }
1034   }
1035
1036   deserializer.deserialize_any(ObservationsVisitor)
1037}
1038
1039/// Parse a string into observations: handles JSON array string or bullet-point
1040/// string
1041fn parse_string_to_observations(s: &str) -> Vec<String> {
1042   let trimmed = s.trim();
1043   if trimmed.is_empty() {
1044      return Vec::new();
1045   }
1046
1047   // Try parsing as JSON array first
1048   if trimmed.starts_with('[')
1049      && let Ok(arr) = serde_json::from_str::<Vec<String>>(trimmed)
1050   {
1051      return arr;
1052   }
1053
1054   // Fall back to bullet-point parsing
1055   trimmed
1056      .lines()
1057      .map(str::trim)
1058      .filter(|line| !line.is_empty())
1059      .map(|line| {
1060         line
1061            .strip_prefix("- ")
1062            .or_else(|| line.strip_prefix("* "))
1063            .or_else(|| line.strip_prefix("• "))
1064            .unwrap_or(line)
1065            .trim()
1066            .to_string()
1067      })
1068      .filter(|line| !line.is_empty())
1069      .collect()
1070}
1071
1072fn build_observation_tool() -> Tool {
1073   Tool {
1074      tool_type: "function".to_string(),
1075      function:  Function {
1076         name:        "create_file_observation".to_string(),
1077         description: "Extract observations from a single file's changes".to_string(),
1078         parameters:  FunctionParameters {
1079            param_type: "object".to_string(),
1080            properties: serde_json::json!({
1081               "observations": {
1082                  "type": "array",
1083                  "description": "List of factual observations about what changed in this file",
1084                  "items": {
1085                     "type": "string"
1086                  }
1087               }
1088            }),
1089            required:   vec!["observations".to_string()],
1090         },
1091      },
1092   }
1093}
1094
1095fn build_analysis_tool(type_enum: &[&str]) -> Tool {
1096   Tool {
1097      tool_type: "function".to_string(),
1098      function:  Function {
1099         name:        "create_conventional_analysis".to_string(),
1100         description: "Synthesize observations into conventional commit analysis".to_string(),
1101         parameters:  FunctionParameters {
1102            param_type: "object".to_string(),
1103            properties: serde_json::json!({
1104               "type": {
1105                  "type": "string",
1106                  "enum": type_enum,
1107                  "description": "Commit type based on combined changes"
1108               },
1109               "scope": {
1110                  "type": "string",
1111                  "description": "Optional scope (module/component). Omit if unclear or multi-component."
1112               },
1113               "details": {
1114                  "type": "array",
1115                  "description": "Array of 0-6 detail items with changelog metadata.",
1116                  "items": {
1117                     "type": "object",
1118                     "properties": {
1119                        "text": {
1120                           "type": "string",
1121                           "description": "Detail about change, starting with past-tense verb, ending with period"
1122                        },
1123                        "changelog_category": {
1124                           "type": "string",
1125                           "enum": ["Added", "Changed", "Fixed", "Deprecated", "Removed", "Security"],
1126                           "description": "Changelog category if user-visible. Omit for internal changes."
1127                        },
1128                        "user_visible": {
1129                           "type": "boolean",
1130                           "description": "True if this change affects users/API and should appear in changelog"
1131                        }
1132                     },
1133                     "required": ["text", "user_visible"]
1134                  }
1135               },
1136               "issue_refs": {
1137                  "type": "array",
1138                  "description": "Issue numbers from context (e.g., ['#123', '#456']). Empty if none.",
1139                  "items": {
1140                     "type": "string"
1141                  }
1142               }
1143            }),
1144            required:   vec!["type".to_string(), "details".to_string(), "issue_refs".to_string()],
1145         },
1146      },
1147   }
1148}
1149
1150fn build_api_request(
1151   model: &str,
1152   temperature: f32,
1153   tools: Vec<Tool>,
1154   system: &str,
1155   user: &str,
1156) -> ApiRequest {
1157   let tool_name = tools.first().map(|t| t.function.name.clone());
1158
1159   let mut messages = Vec::new();
1160   if !system.is_empty() {
1161      messages.push(Message { role: "system".to_string(), content: system.to_string() });
1162   }
1163   messages.push(Message { role: "user".to_string(), content: user.to_string() });
1164
1165   ApiRequest {
1166      model: model.to_string(),
1167      max_tokens: 1500,
1168      temperature,
1169      tool_choice: tool_name
1170         .map(|name| serde_json::json!({ "type": "function", "function": { "name": name } })),
1171      tools,
1172      messages,
1173   }
1174}
1175
/// Unit tests for the map-reduce decision logic, context-header generation,
/// file-description inference, URL building and observation parsing.
#[cfg(test)]
mod tests {
   use super::*;
   use crate::tokens::TokenCounter;

   /// Token counter used by the `should_use_map_reduce` tests.
   fn test_counter() -> TokenCounter {
      TokenCounter::new("http://localhost:4000", None, "claude-sonnet-4.5")
   }

   #[test]
   fn test_should_use_map_reduce_disabled() {
      let config = CommitConfig { map_reduce_enabled: false, ..Default::default() };
      let counter = test_counter();
      // Even with many files, disabled means no map-reduce
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b
diff --git a/c.rs b/c.rs
@@ -0,0 +1 @@
+c
diff --git a/d.rs b/d.rs
@@ -0,0 +1 @@
+d";
      assert!(!should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_should_use_map_reduce_few_files() {
      let config = CommitConfig::default();
      let counter = test_counter();
      // Only 2 files - below threshold
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b";
      assert!(!should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_should_use_map_reduce_many_files() {
      let config = CommitConfig::default();
      let counter = test_counter();
      // 5 files - above threshold
      let diff = r"diff --git a/a.rs b/a.rs
@@ -0,0 +1 @@
+a
diff --git a/b.rs b/b.rs
@@ -0,0 +1 @@
+b
diff --git a/c.rs b/c.rs
@@ -0,0 +1 @@
+c
diff --git a/d.rs b/d.rs
@@ -0,0 +1 @@
+d
diff --git a/e.rs b/e.rs
@@ -0,0 +1 @@
+e";
      assert!(should_use_map_reduce(diff, &config, &counter));
   }

   #[test]
   fn test_generate_context_header_empty() {
      let files = vec![FileDiff {
         filename:  "only.rs".to_string(),
         header:    String::new(),
         content:   String::new(),
         additions: 10,
         deletions: 5,
         is_binary: false,
      }];
      let header = generate_context_header(&files, "only.rs");
      assert!(header.is_empty());
   }

   #[test]
   fn test_generate_context_header_multiple() {
      let files = vec![
         FileDiff {
            filename:  "src/main.rs".to_string(),
            header:    String::new(),
            content:   "fn main() {}".to_string(),
            additions: 10,
            deletions: 5,
            is_binary: false,
         },
         FileDiff {
            filename:  "src/lib.rs".to_string(),
            header:    String::new(),
            content:   "mod test;".to_string(),
            additions: 3,
            deletions: 1,
            is_binary: false,
         },
         FileDiff {
            filename:  "tests/test.rs".to_string(),
            header:    String::new(),
            content:   "#[test]".to_string(),
            additions: 20,
            deletions: 0,
            is_binary: false,
         },
      ];

      let header = generate_context_header(&files, "src/main.rs");
      assert!(header.contains("OTHER FILES IN THIS CHANGE:"));
      assert!(header.contains("src/lib.rs"));
      assert!(header.contains("tests/test.rs"));
      assert!(!header.contains("src/main.rs")); // Current file excluded
   }

   #[test]
   fn test_infer_file_description() {
      assert_eq!(infer_file_description("src/test_utils.rs", ""), "test file");
      assert_eq!(infer_file_description("README.md", ""), "documentation");
      assert_eq!(infer_file_description("config.toml", ""), "configuration");
      assert_eq!(infer_file_description("src/error.rs", ""), "error definitions");
      assert_eq!(infer_file_description("src/types.rs", ""), "type definitions");
      assert_eq!(infer_file_description("src/mod.rs", ""), "module exports");
      assert_eq!(infer_file_description("src/main.rs", ""), "entry point");
      assert_eq!(infer_file_description("src/api.rs", "fn call()"), "implementation");
      assert_eq!(infer_file_description("src/models.rs", "struct Foo"), "type definitions");
      assert_eq!(infer_file_description("src/unknown.xyz", ""), "source code");
   }

   #[test]
   fn test_anthropic_messages_url() {
      assert_eq!(
         anthropic_messages_url("https://api.anthropic.com"),
         "https://api.anthropic.com/v1/messages"
      );
      // Trailing slashes and an existing `/v1` suffix must not be duplicated
      assert_eq!(
         anthropic_messages_url("https://api.anthropic.com/v1/"),
         "https://api.anthropic.com/v1/messages"
      );
   }

   #[test]
   fn test_parse_string_to_observations_json_array() {
      let input = r#"["item one", "item two", "item three"]"#;
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["item one", "item two", "item three"]);
   }

   #[test]
   fn test_parse_string_to_observations_bullet_points() {
      let input = "- added new function\n- fixed bug in parser\n- updated tests";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["added new function", "fixed bug in parser", "updated tests"]);
   }

   #[test]
   fn test_parse_string_to_observations_asterisk_bullets() {
      let input = "* first change\n* second change";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["first change", "second change"]);
   }

   #[test]
   fn test_parse_string_to_observations_dot_bullets() {
      let input = "• first change\n• second change";
      let result = parse_string_to_observations(input);
      assert_eq!(result, vec!["first change", "second change"]);
   }

   #[test]
   fn test_parse_string_to_observations_empty() {
      assert!(parse_string_to_observations("").is_empty());
      assert!(parse_string_to_observations("   ").is_empty());
   }

   #[test]
   fn test_deserialize_observations_array() {
      let json = r#"{"observations": ["a", "b", "c"]}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["a", "b", "c"]);
   }

   #[test]
   fn test_deserialize_observations_stringified_array() {
      let json = r#"{"observations": "[\"a\", \"b\", \"c\"]"}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["a", "b", "c"]);
   }

   #[test]
   fn test_deserialize_observations_bullet_string() {
      let json = r#"{"observations": "- updated function\n- fixed bug"}"#;
      let result: FileObservationResponse = serde_json::from_str(json).unwrap();
      assert_eq!(result.observations, vec!["updated function", "fixed bug"]);
   }
}