Skip to main content

omni_dev/git/
diff_split.rs

1//! Per-file and per-hunk unified diff splitting.
2
/// Marker that begins a per-file section in unified diff output.
///
/// The `a/` prefix of the old-side path is part of the marker, so only lines
/// that start with exactly `diff --git a/` are treated as section starts.
const FILE_DIFF_MARKER: &str = "diff --git a/";
5
/// Marker that begins a hunk within a file diff.
///
/// The trailing space is part of the marker: a line must start with `@@ `
/// (two at-signs followed by a space) to be treated as a hunk header.
const HUNK_MARKER: &str = "@@ ";
8
/// A per-file slice of a unified diff.
///
/// [`split_by_file`] produces one value for every `diff --git a/` section it
/// finds. All fields are public, so values can also be built by hand.
#[derive(Debug, Clone)]
pub struct FileDiff {
    /// Path of the file (extracted from the `b/` side of `diff --git a/... b/...`).
    pub path: String,
    /// Raw text of this file's diff (header + all hunks).
    pub content: String,
    /// Byte length of `content`.
    ///
    /// [`split_by_file`] sets this to `content.len()`. [`split_file_by_hunk`]
    /// does not read this field; it works from `content` directly.
    pub byte_len: usize,
}
19
/// A single hunk within one file's diff.
///
/// [`split_file_by_hunk`] produces one value per hunk and copies the same
/// `file_header` into each of them.
#[derive(Debug, Clone)]
pub struct HunkDiff {
    /// The file header lines (`diff --git`, `index`, `---`, `+++`).
    ///
    /// This is everything in the file's diff that precedes its first hunk.
    pub file_header: String,
    /// Raw text of this hunk starting from the `@@` line.
    ///
    /// Runs up to (but not including) the next hunk's `@@` line, or to the
    /// end of the file's diff for the last hunk.
    pub content: String,
    /// Byte length of `file_header` + `content` combined.
    pub byte_len: usize,
}
30
31/// Splits a flat unified diff at `diff --git a/` boundaries.
32///
33/// Returns one [`FileDiff`] for each file section found in the input.
34/// An empty or whitespace-only input returns an empty `Vec`.
35pub fn split_by_file(diff: &str) -> Vec<FileDiff> {
36    let mut result = Vec::new();
37    let mut positions = Vec::new();
38
39    // Find all positions where a file section starts (at line boundaries).
40    if diff.starts_with(FILE_DIFF_MARKER) {
41        positions.push(0);
42    }
43    let search = format!("\n{FILE_DIFF_MARKER}");
44    let mut start = 0;
45    while let Some(pos) = diff[start..].find(&search) {
46        // +1 to skip the newline; the section starts at `diff`.
47        positions.push(start + pos + 1);
48        start = start + pos + 1;
49    }
50
51    for (i, &pos) in positions.iter().enumerate() {
52        let end = positions.get(i + 1).copied().unwrap_or(diff.len());
53        let content = &diff[pos..end];
54        let first_line = content.lines().next().unwrap_or("");
55        let path = extract_path_from_diff_header(first_line);
56
57        result.push(FileDiff {
58            path,
59            content: content.to_string(),
60            byte_len: content.len(),
61        });
62    }
63
64    result
65}
66
67/// Splits a [`FileDiff`] into per-hunk segments.
68///
69/// Each [`HunkDiff`] includes the file header so it is self-contained.
70/// A file with no hunks (e.g., binary files, mode-only changes) returns
71/// an empty `Vec`.
72pub fn split_file_by_hunk(file_diff: &FileDiff) -> Vec<HunkDiff> {
73    let content = &file_diff.content;
74    let mut hunk_positions = Vec::new();
75
76    // Find all positions where a hunk starts (at line boundaries).
77    if content.starts_with(HUNK_MARKER) {
78        hunk_positions.push(0);
79    }
80    let search = format!("\n{HUNK_MARKER}");
81    let mut start = 0;
82    while let Some(pos) = content[start..].find(&search) {
83        hunk_positions.push(start + pos + 1);
84        start = start + pos + 1;
85    }
86
87    if hunk_positions.is_empty() {
88        return Vec::new();
89    }
90
91    // Everything before the first hunk is the file header.
92    let file_header = &content[..hunk_positions[0]];
93
94    let mut result = Vec::new();
95    for (i, &pos) in hunk_positions.iter().enumerate() {
96        let end = hunk_positions.get(i + 1).copied().unwrap_or(content.len());
97        let hunk_content = &content[pos..end];
98        let byte_len = file_header.len() + hunk_content.len();
99
100        result.push(HunkDiff {
101            file_header: file_header.to_string(),
102            content: hunk_content.to_string(),
103            byte_len,
104        });
105    }
106
107    result
108}
109
110/// Extracts the file path from the `b/` side of a `diff --git` header line.
111fn extract_path_from_diff_header(header_line: &str) -> String {
112    // Format: "diff --git a/old_path b/new_path"
113    // Find the last " b/" to handle paths that may contain spaces.
114    if let Some(b_pos) = header_line.rfind(" b/") {
115        header_line[b_pos + 3..].to_string()
116    } else {
117        // Fallback: try to extract from after "diff --git a/".
118        header_line
119            .strip_prefix(FILE_DIFF_MARKER)
120            .unwrap_or(header_line)
121            .to_string()
122    }
123}
124
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    // ── fixtures ────────────────────────────────────────────────

    /// Renders the four header lines of a modified text file's diff.
    fn make_file_header(path: &str) -> String {
        [
            format!("diff --git a/{path} b/{path}\n"),
            String::from("index abc1234..def5678 100644\n"),
            format!("--- a/{path}\n"),
            format!("+++ b/{path}\n"),
        ]
        .concat()
    }

    /// Renders one hunk: the `@@` range line followed by `body` verbatim.
    fn make_hunk(
        old_start: usize,
        old_count: usize,
        new_start: usize,
        new_count: usize,
        body: &str,
    ) -> String {
        let range_line = format!("@@ -{old_start},{old_count} +{new_start},{new_count} @@");
        format!("{range_line}\n{body}")
    }

    /// Renders a whole diff for one file containing exactly one hunk.
    fn make_single_file_diff(path: &str, hunk_body: &str) -> String {
        let mut diff = make_file_header(path);
        diff.push_str(&make_hunk(1, 3, 1, 4, hunk_body));
        diff
    }

    /// Wraps raw diff text in a [`FileDiff`] whose `byte_len` matches the text.
    fn file_diff_for(path: &str, content: String) -> FileDiff {
        FileDiff {
            path: path.to_string(),
            byte_len: content.len(),
            content,
        }
    }

    /// A `lib.rs` diff made of the standard header followed by `hunks`.
    fn lib_rs_with_hunks(hunks: &[String]) -> FileDiff {
        let mut content = make_file_header("lib.rs");
        for hunk in hunks {
            content.push_str(hunk);
        }
        file_diff_for("lib.rs", content)
    }

    /// The two-hunk `lib.rs` fixture shared by several hunk-splitting tests.
    fn two_hunk_lib_rs() -> FileDiff {
        lib_rs_with_hunks(&[
            make_hunk(1, 3, 1, 4, "+line\n"),
            make_hunk(10, 2, 11, 3, "+other\n"),
        ])
    }

    /// Two single-hunk file diffs (`a.rs`, `b.rs`) glued back to back.
    fn two_file_diff() -> String {
        let mut diff = make_single_file_diff("a.rs", "+line\n");
        diff.push_str(&make_single_file_diff("b.rs", "+other\n"));
        diff
    }

    // ── split_by_file ──────────────────────────────────────────

    #[test]
    fn split_by_file_empty_input() {
        assert!(split_by_file("").is_empty());
    }

    #[test]
    fn split_by_file_whitespace_only() {
        assert!(split_by_file("   \n\n  \t  ").is_empty());
    }

    #[test]
    fn split_by_file_no_diff_markers() {
        assert!(split_by_file("some random text\nwithout diff markers\n").is_empty());
    }

    #[test]
    fn split_by_file_single_file_single_hunk() {
        let input = make_single_file_diff(
            "src/main.rs",
            " fn main() {\n+    println!(\"hello\");\n }\n",
        );

        let files = split_by_file(&input);

        assert_eq!(files.len(), 1);
        let only = &files[0];
        assert_eq!(only.path, "src/main.rs");
        assert_eq!(only.content, input);
    }

    #[test]
    fn split_by_file_single_file_multiple_hunks() {
        let input = [
            make_file_header("lib.rs"),
            make_hunk(1, 3, 1, 4, "+use std::io;\n"),
            make_hunk(10, 2, 11, 3, "+// new comment\n"),
        ]
        .concat();

        let files = split_by_file(&input);

        assert_eq!(files.len(), 1);
        let only = &files[0];
        assert_eq!(only.path, "lib.rs");
        assert!(only.content.contains("@@ -1,3 +1,4 @@"));
        assert!(only.content.contains("@@ -10,2 +11,3 @@"));
    }

    #[test]
    fn split_by_file_multiple_files() {
        let input: String = [("a.rs", "+line\n"), ("b.rs", "+other\n"), ("c.rs", "+third\n")]
            .iter()
            .map(|(path, body)| make_single_file_diff(path, body))
            .collect();

        let files = split_by_file(&input);

        assert_eq!(files.len(), 3);
        let paths: Vec<&str> = files.iter().map(|file| file.path.as_str()).collect();
        assert_eq!(paths, ["a.rs", "b.rs", "c.rs"]);
    }

    #[test]
    fn split_by_file_binary_marker() {
        let input = concat!(
            "diff --git a/image.png b/image.png\n",
            "new file mode 100644\n",
            "index 0000000..abc1234\n",
            "Binary files /dev/null and b/image.png differ\n",
        );

        let files = split_by_file(input);

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, "image.png");
        assert!(files[0].content.contains("Binary files"));
    }

    #[test]
    fn split_by_file_rename() {
        let input = concat!(
            "diff --git a/old_name.rs b/new_name.rs\n",
            "similarity index 95%\n",
            "rename from old_name.rs\n",
            "rename to new_name.rs\n",
            "index abc1234..def5678 100644\n",
            "--- a/old_name.rs\n",
            "+++ b/new_name.rs\n",
            "@@ -1,3 +1,3 @@\n",
            "-// old\n",
            "+// new\n",
        );

        let files = split_by_file(input);

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path, "new_name.rs");
    }

    #[test]
    fn split_by_file_byte_len_matches_content() {
        let input = two_file_diff();

        for FileDiff {
            content, byte_len, ..
        } in &split_by_file(&input)
        {
            assert_eq!(*byte_len, content.len());
        }
    }

    #[test]
    fn split_by_file_content_preserved_verbatim() {
        let input = two_file_diff();

        let mut rejoined = String::new();
        for file in split_by_file(&input) {
            rejoined.push_str(&file.content);
        }

        assert_eq!(rejoined, input);
    }

    // ── split_file_by_hunk ─────────────────────────────────────

    #[test]
    fn split_file_by_hunk_no_hunks() {
        let binary_only = FileDiff {
            path: "image.png".to_string(),
            content: concat!(
                "diff --git a/image.png b/image.png\n",
                "new file mode 100644\n",
                "index 0000000..abc1234\n",
                "Binary files /dev/null and b/image.png differ\n",
            )
            .to_string(),
            byte_len: 0, // not relevant for this test
        };

        assert!(split_file_by_hunk(&binary_only).is_empty());
    }

    #[test]
    fn split_file_by_hunk_single_hunk() {
        let fixture = file_diff_for(
            "main.rs",
            make_single_file_diff("main.rs", " fn main() {\n+    println!(\"hi\");\n }\n"),
        );

        let hunks = split_file_by_hunk(&fixture);

        assert_eq!(hunks.len(), 1);
        let only = &hunks[0];
        assert!(only.content.starts_with(HUNK_MARKER));
        assert!(only.file_header.starts_with("diff --git"));
    }

    #[test]
    fn split_file_by_hunk_multiple_hunks() {
        let fixture = lib_rs_with_hunks(&[
            make_hunk(1, 3, 1, 4, "+use std::io;\n"),
            make_hunk(10, 2, 11, 3, "+// comment\n"),
            make_hunk(20, 1, 22, 2, "+fn new() {}\n"),
        ]);

        assert_eq!(split_file_by_hunk(&fixture).len(), 3);
    }

    #[test]
    fn split_file_by_hunk_header_included() {
        let hunks = split_file_by_hunk(&two_hunk_lib_rs());

        for HunkDiff { file_header, .. } in &hunks {
            assert!(
                file_header.contains("diff --git"),
                "file_header should contain the diff --git line"
            );
            assert!(
                file_header.contains("--- a/"),
                "file_header should contain the --- line"
            );
            assert!(
                file_header.contains("+++ b/"),
                "file_header should contain the +++ line"
            );
        }
    }

    #[test]
    fn split_file_by_hunk_content_starts_with_at() {
        let hunks = split_file_by_hunk(&two_hunk_lib_rs());

        for hunk in &hunks {
            assert!(
                hunk.content.starts_with(HUNK_MARKER),
                "hunk content should start with '{}', got: {:?}",
                HUNK_MARKER,
                &hunk.content[..hunk.content.len().min(20)]
            );
        }
    }

    #[test]
    fn split_file_by_hunk_byte_len_is_header_plus_content() {
        let hunks = split_file_by_hunk(&two_hunk_lib_rs());

        for hunk in &hunks {
            assert_eq!(
                hunk.byte_len,
                hunk.file_header.len() + hunk.content.len(),
                "byte_len should equal file_header.len() + content.len()"
            );
        }
    }

    #[test]
    fn split_file_by_hunk_mode_change_only() {
        let fixture = file_diff_for(
            "script.sh",
            concat!(
                "diff --git a/script.sh b/script.sh\n",
                "old mode 100644\n",
                "new mode 100755\n",
            )
            .to_string(),
        );

        assert!(split_file_by_hunk(&fixture).is_empty());
    }

    // ── extract_path_from_diff_header ──────────────────────────

    #[test]
    fn path_extraction_simple() {
        let extracted = extract_path_from_diff_header("diff --git a/foo.rs b/foo.rs");
        assert_eq!(extracted, "foo.rs");
    }

    #[test]
    fn path_extraction_nested() {
        let extracted =
            extract_path_from_diff_header("diff --git a/src/git/diff.rs b/src/git/diff.rs");
        assert_eq!(extracted, "src/git/diff.rs");
    }

    #[test]
    fn path_extraction_rename() {
        let extracted = extract_path_from_diff_header("diff --git a/old.rs b/new.rs");
        assert_eq!(extracted, "new.rs");
    }

    #[test]
    fn path_extraction_with_spaces() {
        let extracted = extract_path_from_diff_header("diff --git a/my file.rs b/my file.rs");
        assert_eq!(extracted, "my file.rs");
    }

    // ── roundtrip ──────────────────────────────────────────────

    #[test]
    fn roundtrip_split_and_rejoin() {
        let original: String = [("a.rs", "+line1\n"), ("b.rs", "+line2\n"), ("c.rs", "+line3\n")]
            .iter()
            .map(|(path, body)| make_single_file_diff(path, body))
            .collect();

        let mut rejoined = String::new();
        for file in split_by_file(&original) {
            rejoined.push_str(&file.content);
        }

        assert_eq!(rejoined, original);
    }
}