use regex::Regex;
use crate::utils::token::estimate_tokens;
/// The portion of a diff that belongs to a single file, together with an
/// estimate of how many tokens it occupies.
#[derive(Debug, Clone)]
pub struct FileDiff {
/// Path of the file, taken from the diff's file header lines.
pub path: String,
/// Diff lines collected for this file; every line is stored with a trailing `\n`.
pub content: String,
/// Estimated token count: the estimate for `path` plus the per-line
/// estimates of everything in `content`.
pub token_count: usize,
}
/// A group of file diffs that are emitted together as one chunk.
#[derive(Debug, Clone)]
struct DiffChunk {
/// Concatenated text of the chunk: for each file a `diff --git a/<path>`
/// header line followed by that file's diff content.
content: String,
/// Paths of the files contained in this chunk, in insertion order.
files: Vec<String>,
/// Sum of the per-file header overhead and file token counts in this chunk.
token_count: usize,
}
/// Splits a git-style unified diff into one [`FileDiff`] per changed file.
///
/// File boundaries are detected as follows:
///
/// * A `diff --git ...` line always opens a new file section. The path is
///   guessed from the text after the last ` b/` on that line and later
///   overridden by the `---`/`+++` markers when they are present. Sections
///   without markers (pure renames, mode changes, binary files) are still
///   returned, using the guessed path.
/// * A `--- ...` line immediately followed by a `+++ ...` line is a marker
///   pair. The path comes from `+++ b/<path>`, or from `--- a/<path>` when the
///   new side is `/dev/null` (deleted file).
/// * A lone `+++ b/<path>` line is also accepted as a file start.
///
/// In diffs that contain `diff --git` lines, markers are honoured only between
/// that line and the first hunk, so removed or added lines whose text happens
/// to look like `--- a/...` / `+++ b/...` stay part of the hunk body.
///
/// The returned `content` holds the section's extended header lines (for
/// example `new file mode`), any `/dev/null` marker, and all hunk lines; the
/// `diff --git` line and ordinary `---`/`+++` markers are not included.
/// `token_count` is the estimate for the path plus the per-line estimates of
/// the content (a line whose estimate fails counts as 1 token).
///
/// Lines that precede the first recognisable file section are ignored, and a
/// section whose path cannot be determined is dropped.
pub fn parse_diff_into_files(diff: &str) -> Vec<FileDiff> {
    // Text collected for one file before it is turned into a `FileDiff`.
    struct Section {
        path: Option<String>,
        content: String,
        body_tokens: usize,
        // True between a `diff --git` line and the section's markers / first hunk.
        awaiting_markers: bool,
    }

    impl Section {
        fn new(path: Option<String>, awaiting_markers: bool) -> Self {
            Section {
                path,
                content: String::new(),
                body_tokens: 0,
                awaiting_markers,
            }
        }

        fn push_line(&mut self, line: &str) {
            self.content.push_str(line);
            self.content.push('\n');
            self.body_tokens += estimate_tokens(line).unwrap_or(1);
        }

        // Returns `None` when no path was ever determined for the section.
        fn finish(self) -> Option<FileDiff> {
            let path = self.path?;
            let token_count = estimate_tokens(&path).unwrap_or_default() + self.body_tokens;
            Some(FileDiff {
                path,
                content: self.content,
                token_count,
            })
        }
    }

    let mut files = Vec::new();
    let mut current: Option<Section> = None;
    // Set once a `diff --git` line has been seen; from then on markers are
    // only meaningful inside a section header.
    let mut git_format = false;
    let mut lines = diff.lines().peekable();

    while let Some(line) = lines.next() {
        // Hunk body lines start with ' ', '+', '-' or '\', so a line starting
        // with `diff --git ` is always a section header.
        if let Some(header) = line.strip_prefix("diff --git ") {
            if let Some(done) = current.take().and_then(Section::finish) {
                files.push(done);
            }
            let guessed = header.rfind(" b/").map(|at| header[at + 3..].to_string());
            current = Some(Section::new(guessed, true));
            git_format = true;
            continue;
        }

        let awaiting = matches!(&current, Some(section) if section.awaiting_markers);
        if awaiting || !git_format {
            // `--- old` directly followed by `+++ new` forms a marker pair.
            let new_side = match lines.peek() {
                Some(next) if line.starts_with("--- ") && next.starts_with("+++ ") => Some(*next),
                _ => None,
            };
            let marker_path = match new_side {
                Some(new_line) => new_line
                    .strip_prefix("+++ b/")
                    .map(str::to_string)
                    // Deleted file: the new side is `/dev/null`, so read the path
                    // from this section's own `--- a/` line (not the whole diff).
                    .or_else(|| extract_deleted_file_path(line)),
                None => line.strip_prefix("+++ b/").map(str::to_string),
            };

            if let Some(path) = marker_path {
                if !awaiting {
                    // Plain unified diff: the markers themselves open a new file.
                    if let Some(done) = current.take().and_then(Section::finish) {
                        files.push(done);
                    }
                    current = Some(Section::new(None, false));
                }
                if let Some(section) = current.as_mut() {
                    section.path = Some(path);
                    section.awaiting_markers = false;
                    // Keep `/dev/null` markers so creations/deletions stay visible.
                    if line.starts_with("--- /dev/null") {
                        section.push_line(line);
                    }
                    if let Some(new_line) = new_side {
                        if new_line.starts_with("+++ /dev/null") {
                            section.push_line(new_line);
                        }
                    }
                }
                if new_side.is_some() {
                    // The `+++` half of the pair has been handled above.
                    lines.next();
                }
                continue;
            }
        }

        if let Some(section) = current.as_mut() {
            if line.starts_with("@@") {
                // First hunk reached: anything that looks like a marker from
                // here on is hunk content.
                section.awaiting_markers = false;
            }
            section.push_line(line);
        }
    }

    if let Some(done) = current.and_then(Section::finish) {
        files.push(done);
    }
    files
}
/// Returns the path named by the first `--- a/<path>` line in `diff`.
///
/// Only lines that begin exactly with `--- a/` are considered; the prefix is
/// removed and the remainder of the line is returned as-is. Yields `None`
/// when no such line exists.
fn extract_deleted_file_path(diff: &str) -> Option<String> {
    diff.lines()
        .find_map(|candidate| candidate.strip_prefix("--- a/"))
        .map(str::to_string)
}
/// Packs consecutive file diffs into chunks of roughly `max_tokens` tokens.
///
/// Files are appended to the open chunk in order. Each file contributes a
/// `diff --git a/<path>` header line followed by its content, and costs the
/// header's token estimate (5 if the estimate fails) plus the file's own
/// `token_count`. When adding a file to a non-empty chunk would push it past
/// `max_tokens`, that chunk is sealed and the file starts a new one. A file
/// that is too large on its own still gets a chunk to itself, so a chunk may
/// exceed `max_tokens`. Files sharing a chunk are separated by a blank line.
fn merge_diffs_into_chunks(files: &[FileDiff], max_tokens: usize) -> Vec<DiffChunk> {
    let fresh = || DiffChunk {
        content: String::new(),
        files: Vec::new(),
        token_count: 0,
    };

    let mut sealed = Vec::new();
    let mut open = fresh();

    for file in files {
        let header = format!("diff --git a/{}", file.path);
        let header_cost = estimate_tokens(&header).unwrap_or(5);
        let file_cost = header_cost + file.token_count;

        if !open.content.is_empty() {
            if open.token_count + file_cost > max_tokens {
                // Budget exhausted: seal the open chunk and start over.
                sealed.push(std::mem::replace(&mut open, fresh()));
            } else {
                // Same chunk: separate this file from the previous one.
                open.content.push('\n');
            }
        }

        open.content.push_str(&header);
        open.content.push('\n');
        open.content.push_str(&file.content);
        open.files.push(file.path.clone());
        open.token_count += file_cost;
    }

    if !open.content.is_empty() {
        sealed.push(open);
    }
    sealed
}
/// Splits diff text into pieces at hunk boundaries and at the token budget.
///
/// A new piece starts whenever a line is a unified-diff hunk header
/// (`@@ -start[,count] +start[,count] @@`) or whenever adding the line would
/// push the current piece past `max_tokens`. Each line costs its token
/// estimate (1 if the estimate fails) plus 1 for the newline. A single line
/// larger than `max_tokens` still forms a piece of its own. Every returned
/// piece ends with `\n`; empty `content` yields an empty vector.
///
/// # Panics
///
/// Only if the built-in hunk header pattern fails to compile, which would be
/// a programming error.
fn split_file_by_hunks(content: &str, max_tokens: usize) -> Vec<String> {
    // The `,count` part is optional: git omits it when the count is 1
    // (e.g. `@@ -1 +0,0 @@` or `@@ -0,0 +1 @@`).
    let hunk_header = Regex::new(r"^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@")
        .expect("hunk header pattern is a valid regex");

    let mut pieces = Vec::new();
    let mut piece = String::new();
    let mut piece_tokens = 0usize;

    for line in content.lines() {
        let line_tokens = estimate_tokens(line).unwrap_or(1) + 1;
        let starts_hunk = hunk_header.is_match(line);
        let over_budget = piece_tokens + line_tokens > max_tokens;

        // Never emit an empty piece: only cut when something has been collected.
        if !piece.is_empty() && (starts_hunk || over_budget) {
            pieces.push(std::mem::take(&mut piece));
            piece_tokens = 0;
        }

        piece.push_str(line);
        piece.push('\n');
        piece_tokens += line_tokens;
    }

    if !piece.is_empty() {
        pieces.push(piece);
    }
    pieces
}
/// Re-packages a diff that is larger than `max_tokens` into delimited chunks.
///
/// * If the token estimate for `diff` fails, or the diff fits within
///   `max_tokens`, the input is returned unchanged.
/// * Otherwise the diff is parsed into files and the files are packed into
///   chunks of roughly `max_tokens` tokens.
/// * If no file sections are recognised, the raw diff is split by hunks; with
///   more than one piece they are returned as `---CHUNK i OF n---` blocks,
///   otherwise the input is returned unchanged.
/// * If everything lands in a single chunk, that chunk is split by hunks when
///   it is over budget (`---CHUNK i OF n---` blocks); otherwise its content is
///   returned as-is.
/// * With several chunks, each is wrapped in a
///   `---CHUNK i OF MULTIPLE FILES---` block that lists its files. A chunk
///   that is over budget on its own (a single large file) is split by hunks
///   and every piece becomes its own block.
///
/// All chunks are returned in one string, separated by blank lines.
pub fn chunk_diff(diff: &str, max_tokens: usize) -> String {
    // Wraps hunk pieces of a single oversized diff as numbered chunks.
    fn render_hunk_pieces(pieces: Vec<String>) -> String {
        let total = pieces.len();
        pieces
            .into_iter()
            .enumerate()
            .map(|(i, piece)| {
                format!(
                    "---CHUNK {} OF {}---\n{}\n---END CHUNK---",
                    i + 1,
                    total,
                    piece.trim()
                )
            })
            .collect::<Vec<_>>()
            .join("\n\n")
    }

    let total_tokens = match estimate_tokens(diff) {
        Ok(t) => t,
        Err(_) => return diff.to_string(),
    };
    if total_tokens <= max_tokens {
        return diff.to_string();
    }

    let files = parse_diff_into_files(diff);
    let file_chunks = merge_diffs_into_chunks(&files, max_tokens);

    // Nothing recognisable as a file section: never drop the diff, fall back
    // to splitting the raw text.
    if file_chunks.is_empty() {
        let pieces = split_file_by_hunks(diff, max_tokens);
        return if pieces.len() > 1 {
            render_hunk_pieces(pieces)
        } else {
            diff.to_string()
        };
    }

    if let [only] = file_chunks.as_slice() {
        if only.token_count > max_tokens {
            let pieces = split_file_by_hunks(&only.content, max_tokens);
            if pieces.len() > 1 {
                return render_hunk_pieces(pieces);
            }
        }
        return only.content.clone();
    }

    let mut rendered: Vec<String> = Vec::new();
    for chunk in &file_chunks {
        let file_list = chunk.files.join(", ");
        // Only a single large file can exceed the budget here; split it the
        // same way the single-chunk case does.
        let pieces = if chunk.token_count > max_tokens {
            split_file_by_hunks(&chunk.content, max_tokens)
        } else {
            vec![chunk.content.clone()]
        };
        for piece in pieces {
            let number = rendered.len() + 1;
            rendered.push(format!(
                "---CHUNK {} OF MULTIPLE FILES---\nFiles: {}\n\n{}\n---END CHUNK---",
                number,
                file_list,
                piece.trim()
            ));
        }
    }
    rendered.join("\n\n")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A section with a `+++ b/` marker yields one file holding its hunk lines.
    #[test]
    fn test_parse_diff_into_files() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,4 @@\n",
            "+use std::io;\n",
            " fn main() {\n",
            " }\n",
        );

        let parsed = parse_diff_into_files(diff);

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.path, "src/main.rs");
        assert!(only.content.contains("use std::io"));
    }

    /// A diff that fits the budget is returned unchanged.
    #[test]
    fn test_chunk_diff_small() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "+++ b/src/main.rs\n",
            " fn main() {}\n",
        );

        assert_eq!(chunk_diff(diff, 1000), diff);
    }

    /// A newly created file (`--- /dev/null`) is reported under its new path.
    #[test]
    fn test_parse_diff_header_only() {
        let diff = concat!(
            "diff --git a/.gitignore b/.gitignore\n",
            "new file mode 100644\n",
            "--- /dev/null\n",
            "+++ b/.gitignore\n",
            "@@ -0,0 +1 @@\n",
            "+*.tmp\n",
        );

        let parsed = parse_diff_into_files(diff);

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].path, ".gitignore");
    }

    /// A deleted file (`+++ /dev/null`) is reported under its old path.
    #[test]
    fn test_parse_diff_deleted_file() {
        let diff = concat!(
            "diff --git a/old.txt b/old.txt\n",
            "deleted file mode 100644\n",
            "--- a/old.txt\n",
            "+++ /dev/null\n",
            "@@ -1 +0,0 @@\n",
            "-old content\n",
        );

        let parsed = parse_diff_into_files(diff);

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].path, "old.txt");
    }
}