use tracing::debug;
use crate::{
config::mapreduce::MapReduceConfig,
pipeline::diff_analyzer::models::{FileDisposition, FilteredDiff, FilteredFile, FilteredHunk},
};
use super::unit::{MapUnit, MapUnitKind};
/// Renders one filtered file into diff text according to its disposition.
///
/// * `Kept`: a `--- a/<name>` / `+++ b/<name>` header followed by each hunk's rendering,
///   every hunk terminated by a newline.
/// * `SummaryOnly`: a single `# <name>: <summary>` line, or an empty string when the file
///   carries no summary line.
/// * `Dropped`: always an empty string.
fn render_file(file: &FilteredFile) -> String {
    match file.disposition {
        FileDisposition::Dropped => String::new(),
        FileDisposition::SummaryOnly => match file.summary_line {
            Some(ref summary) => format!("# {}: {}\n", file.filename, summary),
            None => String::new(),
        },
        FileDisposition::Kept => {
            // Start from the two-line file header and append hunks in their stored order.
            let mut rendered = format!("--- a/{0}\n+++ b/{0}\n", file.filename);
            for hunk in file.hunks.iter() {
                rendered.push_str(&format!("{}\n", hunk.render()));
            }
            rendered
        }
    }
}
/// Renders a subset of one file's hunks under the usual `--- a/` / `+++ b/` header.
///
/// `path` is used verbatim on both header lines; each hunk's rendering is followed by a
/// newline. An empty `hunks` slice yields the header alone.
fn render_file_hunks(path: &str, hunks: &[FilteredHunk]) -> String {
    hunks.iter().fold(
        format!("--- a/{path}\n+++ b/{path}\n"),
        |mut text, hunk| {
            text.push_str(&format!("{}\n", hunk.render()));
            text
        },
    )
}
/// Reports whether a file has nothing reviewable and should become a metadata-only unit.
///
/// That is the case when its status is `"removed"`, when its disposition is
/// `SummaryOnly`, or when it has no hunks at all.
fn is_metadata_only(file: &FilteredFile) -> bool {
    file.status == "removed"
        || file.disposition == FileDisposition::SummaryOnly
        || file.hunks.is_empty()
}
/// Produces the human-readable note attached to a metadata-only unit.
///
/// The checks run in a fixed order, so a removed file is always reported as deleted
/// even if it is also summary-only or has no hunks. The final `"metadata-only"` text is
/// the fallback for a file that matches none of the specific cases.
fn metadata_note(file: &FilteredFile) -> String {
    let note = if file.status == "removed" {
        "deleted file"
    } else if file.disposition == FileDisposition::SummaryOnly {
        "summary-only (fixture/generated)"
    } else if !file.hunks.is_empty() {
        "metadata-only"
    } else if file.status == "renamed" {
        // No hunks on a rename means only the path changed.
        "rename-only (no content change)"
    } else {
        "binary file or empty diff"
    };
    note.to_string()
}
/// Splits an oversized file into several review units, packing whole hunks greedily.
///
/// Hunks are taken in order and appended to the current chunk until adding the next one
/// would push `header + hunks` past `per_file_chars`; at that point a new chunk starts.
/// A hunk is never split, so a chunk holding a single hunk may still exceed the limit —
/// such units are flagged with `hunk_oversized = true`.
///
/// Lengths are `String::len` values (bytes) of the rendered text, and `diff_char_count`
/// on each returned unit is the byte length of that unit's `diff_text`.
///
/// Returns one `Review` unit per chunk, with `chunk_index` / `chunk_total` filled in.
/// A file without hunks yields an empty vector.
fn sub_chunk_file(file: &FilteredFile, per_file_chars: usize) -> Vec<MapUnit> {
    let header_len = format!("--- a/{0}\n+++ b/{0}\n", file.filename).len();

    // Only the rendered length of each hunk (including its trailing newline) is needed
    // for packing, so the rendered text itself is not retained.
    let hunk_lens: Vec<usize> = file
        .hunks
        .iter()
        .map(|h| format!("{}\n", h.render()).len())
        .collect();

    // Chunks are always contiguous runs of hunks, so each one is recorded as a
    // half-open `(start, end)` index range instead of a list of indices.
    let mut ranges: Vec<(usize, usize)> = Vec::new();
    let mut start: usize = 0;
    let mut current_len: usize = header_len;
    for (idx, &hunk_len) in hunk_lens.iter().enumerate() {
        let chunk_has_hunks = idx > start;
        // Close the current chunk only if it already holds something; a lone hunk that
        // is too large on its own still has to go somewhere.
        if chunk_has_hunks && current_len + hunk_len > per_file_chars {
            ranges.push((start, idx));
            start = idx;
            current_len = header_len;
        }
        current_len += hunk_len;
    }
    if start < hunk_lens.len() {
        ranges.push((start, hunk_lens.len()));
    }

    let chunk_total = ranges.len().max(1);
    ranges
        .into_iter()
        .enumerate()
        .map(|(chunk_index, (start, end))| {
            let hunk_oversized =
                end - start == 1 && header_len + hunk_lens[start] > per_file_chars;
            // Borrow the contiguous run directly; no per-hunk clone is required.
            let diff_text = render_file_hunks(&file.filename, &file.hunks[start..end]);
            let diff_char_count = diff_text.len();
            MapUnit {
                file: file.filename.clone(),
                status: file.status.clone(),
                kind: MapUnitKind::Review { diff_text },
                diff_char_count,
                chunk_index,
                chunk_total,
                hunk_oversized,
            }
        })
        .collect()
}
pub fn split_into_units(filtered: &FilteredDiff, config: &MapReduceConfig) -> Vec<MapUnit> {
let per_file_chars = config.per_file_chars;
let total_budget = config.total_char_budget;
let max_calls = config.max_calls;
let mut units: Vec<MapUnit> = Vec::with_capacity(filtered.files.len());
let mut total_chars_used: usize = 0;
let mut review_calls: usize = 0;
for file in &filtered.files {
if is_metadata_only(file) {
let note = metadata_note(file);
debug!(
file = %file.filename,
status = %file.status,
%note,
"splitter: metadata-only unit"
);
units.push(MapUnit {
file: file.filename.clone(),
status: file.status.clone(),
kind: MapUnitKind::MetadataOnly { note },
diff_char_count: 0,
chunk_index: 0,
chunk_total: 1,
hunk_oversized: false,
});
continue;
}
let rendered = render_file(file);
let rendered_len = rendered.len();
if rendered_len <= per_file_chars {
if review_calls >= max_calls {
debug!(
file = %file.filename,
review_calls,
"splitter: max_calls reached — downgrading to metadata-only"
);
units.push(MapUnit {
file: file.filename.clone(),
status: file.status.clone(),
kind: MapUnitKind::MetadataOnly {
note: "max-calls reached".to_string(),
},
diff_char_count: 0,
chunk_index: 0,
chunk_total: 1,
hunk_oversized: false,
});
continue;
}
if total_chars_used + rendered_len > total_budget {
debug!(
file = %file.filename,
total_chars_used,
rendered_len,
total_budget,
"splitter: total_char_budget exhausted — downgrading to metadata-only"
);
units.push(MapUnit {
file: file.filename.clone(),
status: file.status.clone(),
kind: MapUnitKind::MetadataOnly {
note: "budget exhausted".to_string(),
},
diff_char_count: 0,
chunk_index: 0,
chunk_total: 1,
hunk_oversized: false,
});
continue;
}
total_chars_used += rendered_len;
review_calls += 1;
debug!(
file = %file.filename,
rendered_len,
review_calls,
total_chars_used,
"splitter: single-unit review"
);
units.push(MapUnit {
file: file.filename.clone(),
status: file.status.clone(),
kind: MapUnitKind::Review {
diff_text: rendered,
},
diff_char_count: rendered_len,
chunk_index: 0,
chunk_total: 1,
hunk_oversized: false,
});
} else {
debug!(
file = %file.filename,
rendered_len,
per_file_chars,
"splitter: oversized file — sub-chunking by hunk"
);
let sub_chunks = sub_chunk_file(file, per_file_chars);
for mut chunk in sub_chunks {
if chunk.is_metadata_only() {
units.push(chunk);
continue;
}
if review_calls >= max_calls {
chunk.kind = MapUnitKind::MetadataOnly {
note: "max-calls reached".to_string(),
};
chunk.diff_char_count = 0;
units.push(chunk);
continue;
}
if total_chars_used + chunk.diff_char_count > total_budget {
chunk.kind = MapUnitKind::MetadataOnly {
note: "budget exhausted".to_string(),
};
chunk.diff_char_count = 0;
units.push(chunk);
continue;
}
total_chars_used += chunk.diff_char_count;
review_calls += 1;
units.push(chunk);
}
}
}
debug!(
total_units = units.len(),
review_units = review_calls,
total_chars_used,
"splitter: done"
);
units
}
// Test modules are kept in sibling files and compiled only for test builds.
// `tests` is loaded from `splitter_tests.rs`.
#[cfg(test)]
#[path = "splitter_tests.rs"]
mod tests;
// `budget_tests` is loaded from `splitter_budget_tests.rs`; judging by the name it
// covers the max-calls / size-budget handling — confirm against that file.
#[cfg(test)]
#[path = "splitter_budget_tests.rs"]
mod budget_tests;