Skip to main content

rusty_commit/utils/
diff_chunking.rs

1//! Utilities for chunking large git diffs into token-safe pieces.
2//!
3//! This module provides multi-level diff chunking to handle large diffs that
4//! exceed model token limits:
5//! 1. **File-level merging**: Greedily combine entire file diffs until token limit
6//! 2. **Hunk-level splitting**: If a single file is too large, split by hunks
7//! 3. **Line-level splitting**: For extremely large hunks, split by lines
8
9use regex::Regex;
10
11use crate::utils::token::estimate_tokens;
12
/// One file's share of a unified diff, together with the bookkeeping the
/// chunker needs.
#[derive(Clone, Debug)]
pub struct FileDiff {
    /// Path of the file this diff belongs to, e.g. `"src/main.rs"`.
    pub path: String,
    /// Raw diff text collected for this file.
    pub content: String,
    /// Estimated number of tokens this entry costs.
    pub token_count: usize,
}
23
/// A group of file diffs that is emitted as one chunk.
#[derive(Clone, Debug)]
struct DiffChunk {
    /// Concatenated diff text of every file in the chunk.
    content: String,
    /// Paths of the files whose diffs are part of `content`.
    files: Vec<String>,
    /// Sum of the estimated token costs accumulated for this chunk.
    token_count: usize,
}
34
35/// Parses a unified diff into individual file diffs.
36///
37/// Returns a vector of FileDiff, one per modified/new/deleted file.
38pub fn parse_diff_into_files(diff: &str) -> Vec<FileDiff> {
39    let mut files = Vec::new();
40    let mut current_file: Option<FileDiff> = None;
41
42    for line in diff.lines() {
43        // Check if this is a new file header
44        if line.starts_with("+++ b/") {
45            // Save previous file if exists
46            if let Some(file) = current_file.take() {
47                files.push(file);
48            }
49
50            let path = line.strip_prefix("+++ b/").unwrap_or(line).to_string();
51            let tokens = estimate_tokens(&path).unwrap_or_default();
52
53            current_file = Some(FileDiff {
54                path,
55                content: String::new(),
56                token_count: tokens,
57            });
58            continue;
59        }
60
61        // Check if this is a deleted file header (+++ /dev/null)
62        if line.starts_with("+++ /dev/null") && current_file.is_none() {
63            // This is a deleted file, parse the path from --- a/
64            let path = extract_deleted_file_path(diff);
65            if let Some(p) = path.clone() {
66                let tokens = estimate_tokens(&p).unwrap_or_default();
67                current_file = Some(FileDiff {
68                    path: p,
69                    content: String::new(),
70                    token_count: tokens,
71                });
72            }
73            continue;
74        }
75
76        // Check if this is a deleted file old path (we need to capture path before +++)
77        if line.starts_with("--- a/") && current_file.is_none() {
78            let path = line.strip_prefix("--- a/").unwrap_or(line).to_string();
79            let tokens = estimate_tokens(&path).unwrap_or_default();
80            // Create file - will be updated if +++ b/ has different path
81            current_file = Some(FileDiff {
82                path,
83                content: String::new(),
84                token_count: tokens,
85            });
86            continue;
87        }
88
89        // Add line to current file if we have one
90        if let Some(ref mut file) = current_file {
91            file.content.push_str(line);
92            file.content.push('\n');
93            file.token_count += estimate_tokens(line).unwrap_or(1);
94        }
95    }
96
97    // Save the last file
98    if let Some(file) = current_file {
99        files.push(file);
100    }
101
102    files
103}
104
/// Returns the path named by the first `--- a/<path>` line in `diff`.
///
/// This is the old-side header of a file section; for a deleted file (whose
/// new side is `/dev/null`) it is the only line carrying the path. Yields
/// `None` when no line starts with `--- a/`.
fn extract_deleted_file_path(diff: &str) -> Option<String> {
    diff.lines()
        .find_map(|line| line.strip_prefix("--- a/"))
        .map(String::from)
}
114
115/// Merges file diffs greedily until reaching the token limit.
116///
117/// This is the first level of chunking - it groups whole files together
118/// to maximize context while staying under the token limit.
119fn merge_diffs_into_chunks(files: &[FileDiff], max_tokens: usize) -> Vec<DiffChunk> {
120    let mut chunks = Vec::new();
121    let mut current_chunk = DiffChunk {
122        content: String::new(),
123        files: Vec::new(),
124        token_count: 0,
125    };
126
127    for file in files {
128        // Add file header overhead
129        let header_overhead = estimate_tokens(&format!("diff --git a/{}", file.path)).unwrap_or(5);
130
131        let would_exceed = if current_chunk.content.is_empty() {
132            file.token_count > max_tokens
133        } else {
134            current_chunk.token_count + header_overhead + file.token_count > max_tokens
135        };
136
137        if would_exceed && !current_chunk.content.is_empty() {
138            // Save current chunk and start a new one
139            chunks.push(current_chunk);
140            current_chunk = DiffChunk {
141                content: String::new(),
142                files: Vec::new(),
143                token_count: 0,
144            };
145        }
146
147        // Add file to current chunk
148        if !current_chunk.content.is_empty() {
149            current_chunk.content.push('\n');
150        }
151        current_chunk
152            .content
153            .push_str(&format!("diff --git a/{}\n", file.path));
154        current_chunk.content.push_str(&file.content);
155        current_chunk.files.push(file.path.clone());
156        current_chunk.token_count += header_overhead + file.token_count;
157    }
158
159    // Add the last chunk
160    if !current_chunk.content.is_empty() {
161        chunks.push(current_chunk);
162    }
163
164    chunks
165}
166
167/// Splits a file diff by git hunks (--- a/... / +++ b/ ... @@ ... @@).
168///
169/// This is the second level of chunking - if a single file is too large,
170/// we split it by individual hunks.
171fn split_file_by_hunks(content: &str, max_tokens: usize) -> Vec<String> {
172    let mut hunks = Vec::new();
173    let mut current_hunk = String::new();
174    let mut current_tokens = 0;
175
176    // Track hunk boundaries
177    let hunk_header_pattern = Regex::new(r"^@@ -\d+,\d+ \+\d+,\d+ @@").unwrap();
178
179    for line in content.lines() {
180        let line_tokens = estimate_tokens(line).unwrap_or(1) + 1; // +1 for newline
181
182        // Check if this is the start of a new hunk
183        if hunk_header_pattern.is_match(line) && !current_hunk.is_empty() {
184            hunks.push(current_hunk);
185            current_hunk = String::new();
186            current_tokens = 0;
187        }
188
189        if current_tokens + line_tokens > max_tokens && !current_hunk.is_empty() {
190            hunks.push(current_hunk);
191            current_hunk = String::new();
192            current_tokens = 0;
193        }
194
195        current_hunk.push_str(line);
196        current_hunk.push('\n');
197        current_tokens += line_tokens;
198    }
199
200    if !current_hunk.is_empty() {
201        hunks.push(current_hunk);
202    }
203
204    // If we never entered a hunk, return the original content
205    if hunks.is_empty() && !content.is_empty() {
206        hunks.push(content.to_string());
207    }
208
209    hunks
210}
211
212/// Performs multi-level diff chunking for large diffs.
213///
214/// This function implements a three-tier approach to chunking:
215/// 1. **File-level merging**: Greedily combine entire file diffs until token limit
216/// 2. **Hunk-level splitting**: If a single file is too large, split by hunks
217/// 3. **Line-level splitting**: For extremely large hunks, split by lines
218///
219/// # Arguments
220///
221/// * `diff` - The full git diff string
222/// * `max_tokens` - Maximum tokens allowed per chunk (prompt overhead will be subtracted)
223///
224/// # Returns
225///
226/// A single string containing the chunked diff, with separators between chunks
227pub fn chunk_diff(diff: &str, max_tokens: usize) -> String {
228    // Early return for small diffs
229    let total_tokens = match estimate_tokens(diff) {
230        Ok(t) => t,
231        Err(_) => return diff.to_string(),
232    };
233
234    if total_tokens <= max_tokens {
235        return diff.to_string();
236    }
237
238    // Parse diff into individual files
239    let files = parse_diff_into_files(diff);
240
241    // First try: merge whole files
242    let file_chunks = merge_diffs_into_chunks(&files, max_tokens);
243
244    if file_chunks.len() == 1 {
245        // Single file but still too large - need to split by hunks
246        let chunk = &file_chunks[0];
247        if chunk.token_count > max_tokens {
248            let hunk_chunks = split_file_by_hunks(&chunk.content, max_tokens);
249            let total_hunks = hunk_chunks.len();
250            if total_hunks > 1 {
251                return hunk_chunks
252                    .into_iter()
253                    .enumerate()
254                    .map(|(i, hunk)| {
255                        format!(
256                            "---CHUNK {} OF {}---\n{}\n---END CHUNK---",
257                            i + 1,
258                            total_hunks,
259                            hunk.trim()
260                        )
261                    })
262                    .collect::<Vec<_>>()
263                    .join("\n\n");
264            }
265        }
266        return chunk.content.clone();
267    }
268
269    // Multiple file chunks - join with separator
270    file_chunks
271        .into_iter()
272        .enumerate()
273        .map(|(i, chunk)| {
274            let file_list = chunk.files.join(", ");
275            format!(
276                "---CHUNK {} OF MULTIPLE FILES---\nFiles: {}\n\n{}\n---END CHUNK---",
277                i + 1,
278                file_list,
279                chunk.content.trim()
280            )
281        })
282        .collect::<Vec<_>>()
283        .join("\n\n")
284}
285
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the paths of the parsed entries, in order.
    fn paths(files: &[FileDiff]) -> Vec<&str> {
        files.iter().map(|file| file.path.as_str()).collect()
    }

    // A modified file yields one entry carrying the hunk text.
    #[test]
    fn test_parse_diff_into_files() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,4 @@\n",
            "+use std::io;\n",
            " fn main() {\n",
            " }\n",
        );
        let files = parse_diff_into_files(diff);
        assert_eq!(paths(&files), ["src/main.rs"]);
        assert!(files[0].content.contains("use std::io"));
    }

    // A diff below the token limit is passed through untouched.
    #[test]
    fn test_chunk_diff_small() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "+++ b/src/main.rs\n",
            " fn main() {}\n",
        );
        assert_eq!(chunk_diff(diff, 1000), diff);
    }

    // A newly added file (`--- /dev/null`) is reported under its new path.
    #[test]
    fn test_parse_diff_header_only() {
        let diff = concat!(
            "diff --git a/.gitignore b/.gitignore\n",
            "new file mode 100644\n",
            "--- /dev/null\n",
            "+++ b/.gitignore\n",
            "@@ -0,0 +1 @@\n",
            "+*.tmp\n",
        );
        let files = parse_diff_into_files(diff);
        assert_eq!(paths(&files), [".gitignore"]);
    }

    // A deleted file (`+++ /dev/null`) is reported under its old path.
    #[test]
    fn test_parse_diff_deleted_file() {
        let diff = concat!(
            "diff --git a/old.txt b/old.txt\n",
            "deleted file mode 100644\n",
            "--- a/old.txt\n",
            "+++ /dev/null\n",
            "@@ -1 +0,0 @@\n",
            "-old content\n",
        );
        let files = parse_diff_into_files(diff);
        assert_eq!(paths(&files), ["old.txt"]);
    }
}