rusty_commit/utils/
diff_chunking.rs1use regex::Regex;
10
11use crate::utils::token::estimate_tokens;
12
/// The portion of a diff that belongs to one file.
#[derive(Debug, Clone)]
pub struct FileDiff {
    /// File path with the `a/` / `b/` diff prefix removed.
    pub path: String,
    /// Diff lines collected for this file, each terminated by `\n`.
    pub content: String,
    /// Token estimate for the path plus every line stored in `content`.
    pub token_count: usize,
}
23
/// A group of per-file diffs that are emitted together as one chunk.
#[derive(Debug, Clone)]
struct DiffChunk {
    /// Concatenated diff text of every file in the chunk.
    content: String,
    /// Paths of the files contained in the chunk, in insertion order.
    files: Vec<String>,
    /// Sum of the token estimates of the contained files and their headers.
    token_count: usize,
}
34
35pub fn parse_diff_into_files(diff: &str) -> Vec<FileDiff> {
39 let mut files = Vec::new();
40 let mut current_file: Option<FileDiff> = None;
41
42 for line in diff.lines() {
43 if line.starts_with("+++ b/") {
45 if let Some(file) = current_file.take() {
47 files.push(file);
48 }
49
50 let path = line.strip_prefix("+++ b/").unwrap_or(line).to_string();
51 let tokens = estimate_tokens(&path).unwrap_or_default();
52
53 current_file = Some(FileDiff {
54 path,
55 content: String::new(),
56 token_count: tokens,
57 });
58 continue;
59 }
60
61 if line.starts_with("+++ /dev/null") && current_file.is_none() {
63 let path = extract_deleted_file_path(diff);
65 if let Some(p) = path.clone() {
66 let tokens = estimate_tokens(&p).unwrap_or_default();
67 current_file = Some(FileDiff {
68 path: p,
69 content: String::new(),
70 token_count: tokens,
71 });
72 }
73 continue;
74 }
75
76 if line.starts_with("--- a/") && current_file.is_none() {
78 let path = line.strip_prefix("--- a/").unwrap_or(line).to_string();
79 let tokens = estimate_tokens(&path).unwrap_or_default();
80 current_file = Some(FileDiff {
82 path,
83 content: String::new(),
84 token_count: tokens,
85 });
86 continue;
87 }
88
89 if let Some(ref mut file) = current_file {
91 file.content.push_str(line);
92 file.content.push('\n');
93 file.token_count += estimate_tokens(line).unwrap_or(1);
94 }
95 }
96
97 if let Some(file) = current_file {
99 files.push(file);
100 }
101
102 files
103}
104
/// Returns the path from the first `--- a/<path>` line found anywhere in
/// `diff`, or `None` when no line starts with that prefix.
fn extract_deleted_file_path(diff: &str) -> Option<String> {
    diff.lines()
        .find_map(|line| line.strip_prefix("--- a/"))
        .map(str::to_string)
}
114
115fn merge_diffs_into_chunks(files: &[FileDiff], max_tokens: usize) -> Vec<DiffChunk> {
120 let mut chunks = Vec::new();
121 let mut current_chunk = DiffChunk {
122 content: String::new(),
123 files: Vec::new(),
124 token_count: 0,
125 };
126
127 for file in files {
128 let header_overhead = estimate_tokens(&format!("diff --git a/{}", file.path)).unwrap_or(5);
130
131 let would_exceed = if current_chunk.content.is_empty() {
132 file.token_count > max_tokens
133 } else {
134 current_chunk.token_count + header_overhead + file.token_count > max_tokens
135 };
136
137 if would_exceed && !current_chunk.content.is_empty() {
138 chunks.push(current_chunk);
140 current_chunk = DiffChunk {
141 content: String::new(),
142 files: Vec::new(),
143 token_count: 0,
144 };
145 }
146
147 if !current_chunk.content.is_empty() {
149 current_chunk.content.push('\n');
150 }
151 current_chunk
152 .content
153 .push_str(&format!("diff --git a/{}\n", file.path));
154 current_chunk.content.push_str(&file.content);
155 current_chunk.files.push(file.path.clone());
156 current_chunk.token_count += header_overhead + file.token_count;
157 }
158
159 if !current_chunk.content.is_empty() {
161 chunks.push(current_chunk);
162 }
163
164 chunks
165}
166
167fn split_file_by_hunks(content: &str, max_tokens: usize) -> Vec<String> {
172 let mut hunks = Vec::new();
173 let mut current_hunk = String::new();
174 let mut current_tokens = 0;
175
176 let hunk_header_pattern = Regex::new(r"^@@ -\d+,\d+ \+\d+,\d+ @@").unwrap();
178
179 for line in content.lines() {
180 let line_tokens = estimate_tokens(line).unwrap_or(1) + 1; if hunk_header_pattern.is_match(line) && !current_hunk.is_empty() {
184 hunks.push(current_hunk);
185 current_hunk = String::new();
186 current_tokens = 0;
187 }
188
189 if current_tokens + line_tokens > max_tokens && !current_hunk.is_empty() {
190 hunks.push(current_hunk);
191 current_hunk = String::new();
192 current_tokens = 0;
193 }
194
195 current_hunk.push_str(line);
196 current_hunk.push('\n');
197 current_tokens += line_tokens;
198 }
199
200 if !current_hunk.is_empty() {
201 hunks.push(current_hunk);
202 }
203
204 if hunks.is_empty() && !content.is_empty() {
206 hunks.push(content.to_string());
207 }
208
209 hunks
210}
211
212pub fn chunk_diff(diff: &str, max_tokens: usize) -> String {
228 let total_tokens = match estimate_tokens(diff) {
230 Ok(t) => t,
231 Err(_) => return diff.to_string(),
232 };
233
234 if total_tokens <= max_tokens {
235 return diff.to_string();
236 }
237
238 let files = parse_diff_into_files(diff);
240
241 let file_chunks = merge_diffs_into_chunks(&files, max_tokens);
243
244 if file_chunks.len() == 1 {
245 let chunk = &file_chunks[0];
247 if chunk.token_count > max_tokens {
248 let hunk_chunks = split_file_by_hunks(&chunk.content, max_tokens);
249 let total_hunks = hunk_chunks.len();
250 if total_hunks > 1 {
251 return hunk_chunks
252 .into_iter()
253 .enumerate()
254 .map(|(i, hunk)| {
255 format!(
256 "---CHUNK {} OF {}---\n{}\n---END CHUNK---",
257 i + 1,
258 total_hunks,
259 hunk.trim()
260 )
261 })
262 .collect::<Vec<_>>()
263 .join("\n\n");
264 }
265 }
266 return chunk.content.clone();
267 }
268
269 file_chunks
271 .into_iter()
272 .enumerate()
273 .map(|(i, chunk)| {
274 let file_list = chunk.files.join(", ");
275 format!(
276 "---CHUNK {} OF MULTIPLE FILES---\nFiles: {}\n\n{}\n---END CHUNK---",
277 i + 1,
278 file_list,
279 chunk.content.trim()
280 )
281 })
282 .collect::<Vec<_>>()
283 .join("\n\n")
284}
285
#[cfg(test)]
mod tests {
    use super::*;

    /// A one-file diff produces a single entry with its path and hunk lines.
    #[test]
    fn test_parse_diff_into_files() {
        let parsed = parse_diff_into_files(
            "diff --git a/src/main.rs b/src/main.rs\n+++ b/src/main.rs\n@@ -1,3 +1,4 @@\n+use std::io;\n fn main() {\n }\n",
        );

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.path, "src/main.rs");
        assert!(only.content.contains("use std::io"));
    }

    /// A diff within the token budget is returned untouched.
    #[test]
    fn test_chunk_diff_small() {
        let input = "diff --git a/src/main.rs b/src/main.rs\n+++ b/src/main.rs\n fn main() {}\n";

        assert_eq!(chunk_diff(input, 1000), input);
    }

    /// A newly added file (old side `/dev/null`) is reported under its new path.
    #[test]
    fn test_parse_diff_header_only() {
        let parsed = parse_diff_into_files(
            "diff --git a/.gitignore b/.gitignore\nnew file mode 100644\n--- /dev/null\n+++ b/.gitignore\n@@ -0,0 +1 @@\n+*.tmp\n",
        );

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].path, ".gitignore");
    }

    /// A deleted file (new side `/dev/null`) is reported under its old path.
    #[test]
    fn test_parse_diff_deleted_file() {
        let parsed = parse_diff_into_files(
            "diff --git a/old.txt b/old.txt\ndeleted file mode 100644\n--- a/old.txt\n+++ /dev/null\n@@ -1 +0,0 @@\n-old content\n",
        );

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].path, "old.txt");
    }
}