1use std::{
5 collections::{HashMap, HashSet, VecDeque},
6 fmt::Write,
7 path::{Path, PathBuf},
8};
9
10use path_absolutize::Absolutize;
11use similar::{Algorithm, ChangeTag, TextDiff};
12
13use crate::hash::ObjectHash;
14
/// The diff output for a single file.
#[derive(Debug, Clone)]
pub struct DiffItem {
    /// File path as given by the caller, lossily converted to UTF-8.
    pub path: String,
    /// Unified-diff text for the file, or a `<LargeFile>` marker line
    /// when the file was too large to diff.
    pub data: String,
}
25
/// Namespace type for diff computation; all functionality lives in
/// associated functions (no instance state).
pub struct Diff;
28
/// A single line-level edit operation produced by [`compute_diff`].
/// All line numbers are 1-based.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DiffOperation {
    /// `content` was inserted at position `line` of the new file.
    Insert { line: usize, content: String },
    /// The line at position `line` of the old file was removed.
    Delete { line: usize },
    /// A line present in both files, with its position in each.
    Equal { old_line: usize, new_line: usize },
}
36
/// One rendered line of a unified-diff hunk, borrowing the line text from
/// the input strings.
#[derive(Debug, Clone, Copy)]
enum EditLine<'a> {
    /// Unchanged line with its 1-based old and new line numbers
    /// (each optional, though in practice both are set).
    Context(Option<usize>, Option<usize>, &'a str),
    /// Line removed from the old file (1-based old line number).
    Delete(usize, &'a str),
    /// Line added in the new file (1-based new line number).
    Insert(usize, &'a str),
}
47
48impl Diff {
49 fn compute_line_operations(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
51 if old_lines.is_empty() && new_lines.is_empty() {
52 return Vec::new();
53 }
54
55 let old_refs: Vec<&str> = old_lines.iter().map(|s| s.as_str()).collect();
56 let new_refs: Vec<&str> = new_lines.iter().map(|s| s.as_str()).collect();
57
58 let diff = TextDiff::configure()
59 .algorithm(Algorithm::Myers)
60 .diff_slices(&old_refs, &new_refs);
61
62 let mut operations = Vec::with_capacity(old_lines.len() + new_lines.len());
63 let mut old_line_no = 1usize;
64 let mut new_line_no = 1usize;
65
66 for change in diff.iter_all_changes() {
67 match change.tag() {
68 ChangeTag::Equal => {
69 operations.push(DiffOperation::Equal {
70 old_line: old_line_no,
71 new_line: new_line_no,
72 });
73 old_line_no += 1;
74 new_line_no += 1;
75 }
76 ChangeTag::Delete => {
77 operations.push(DiffOperation::Delete { line: old_line_no });
78 old_line_no += 1;
79 }
80 ChangeTag::Insert => {
81 operations.push(DiffOperation::Insert {
82 line: new_line_no,
83 content: change.value().to_string(),
84 });
85 new_line_no += 1;
86 }
87 }
88 }
89
90 operations
91 }
92
    /// Combined (old + new) line count above which a textual diff is
    /// skipped and a `<LargeFile>` marker is emitted instead.
    const MAX_DIFF_LINES: usize = 10_000;
    /// Opening tag of the large-file marker line.
    const LARGE_FILE_MARKER: &'static str = "<LargeFile>";
    /// Closing tag of the large-file marker line.
    const LARGE_FILE_END: &'static str = "</LargeFile>";
    /// Number of leading hex digits shown in the `index` header line.
    const SHORT_HASH_LEN: usize = 7;
97
98 pub fn diff<F>(
100 old_blobs: Vec<(PathBuf, ObjectHash)>,
101 new_blobs: Vec<(PathBuf, ObjectHash)>,
102 filter: Vec<PathBuf>,
103 read_content: F,
104 ) -> Vec<DiffItem>
105 where
106 F: Fn(&PathBuf, &ObjectHash) -> Vec<u8>,
107 {
108 let (processed_files, old_blobs_map, new_blobs_map) =
109 Self::prepare_diff_data(old_blobs, new_blobs, &filter);
110
111 let mut diff_results: Vec<DiffItem> = Vec::with_capacity(processed_files.len());
112 for file in processed_files {
113 let old_hash = old_blobs_map.get(&file);
115 let new_hash = new_blobs_map.get(&file);
116 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(&file, h));
117 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(&file, h));
118
119 if let Some(large_file_marker) =
120 Self::is_large_file_bytes(&file, &old_bytes, &new_bytes)
121 {
122 diff_results.push(DiffItem {
123 path: file.to_string_lossy().to_string(),
124 data: large_file_marker,
125 });
126 } else {
127 let diff = Self::diff_for_file_preloaded(
128 &file, old_hash, new_hash, &old_bytes, &new_bytes,
129 );
130 diff_results.push(DiffItem {
131 path: file.to_string_lossy().to_string(),
132 data: diff,
133 });
134 }
135 }
136
137 diff_results
138 }
139
140 fn is_large_file_bytes(file: &Path, old_bytes: &[u8], new_bytes: &[u8]) -> Option<String> {
142 let old_lines = String::from_utf8_lossy(old_bytes).lines().count();
143 let new_lines = String::from_utf8_lossy(new_bytes).lines().count();
144 let total_lines = old_lines + new_lines;
145 if total_lines > Self::MAX_DIFF_LINES {
146 Some(format!(
147 "{}{}:{}:{}{}\n",
148 Self::LARGE_FILE_MARKER,
149 file.display(),
150 total_lines,
151 Self::MAX_DIFF_LINES,
152 Self::LARGE_FILE_END
153 ))
154 } else {
155 None
156 }
157 }
158
159 fn prepare_diff_data(
161 old_blobs: Vec<(PathBuf, ObjectHash)>,
162 new_blobs: Vec<(PathBuf, ObjectHash)>,
163 filter: &[PathBuf],
164 ) -> (
165 Vec<PathBuf>,
166 HashMap<PathBuf, ObjectHash>,
167 HashMap<PathBuf, ObjectHash>,
168 ) {
169 let old_blobs_map: HashMap<PathBuf, ObjectHash> = old_blobs.into_iter().collect();
170 let new_blobs_map: HashMap<PathBuf, ObjectHash> = new_blobs.into_iter().collect();
171 let union_files: HashSet<PathBuf> = old_blobs_map
173 .keys()
174 .chain(new_blobs_map.keys())
175 .cloned()
176 .collect();
177
178 let processed_files: Vec<PathBuf> = union_files
180 .into_iter()
181 .filter(|file| Self::should_process(file, filter, &old_blobs_map, &new_blobs_map))
182 .collect();
183
184 (processed_files, old_blobs_map, new_blobs_map)
185 }
186
187 fn should_process(
189 file: &PathBuf,
190 filter: &[PathBuf],
191 old_blobs: &HashMap<PathBuf, ObjectHash>,
192 new_blobs: &HashMap<PathBuf, ObjectHash>,
193 ) -> bool {
194 if !filter.is_empty()
195 && !filter
196 .iter()
197 .any(|path| Self::sub_of(file, path).unwrap_or(false))
198 {
199 return false;
200 }
201
202 old_blobs.get(file) != new_blobs.get(file)
203 }
204
205 fn sub_of(path: &PathBuf, parent: &PathBuf) -> Result<bool, std::io::Error> {
207 let path_abs: PathBuf = path.absolutize()?.to_path_buf();
208 let parent_abs: PathBuf = parent.absolutize()?.to_path_buf();
209 Ok(path_abs.starts_with(parent_abs))
210 }
211
212 fn short_hash(hash: Option<&ObjectHash>) -> String {
214 hash.map(|h| {
215 let hex = h.to_string();
216 let take = Self::SHORT_HASH_LEN.min(hex.len());
217 hex[..take].to_string()
218 })
219 .unwrap_or_else(|| "0".repeat(Self::SHORT_HASH_LEN))
220 }
221
222 pub fn diff_for_file_string(
224 file: &PathBuf,
225 old_blobs: &HashMap<PathBuf, ObjectHash>,
226 new_blobs: &HashMap<PathBuf, ObjectHash>,
227 read_content: &dyn Fn(&PathBuf, &ObjectHash) -> Vec<u8>,
228 ) -> String {
229 let new_hash = new_blobs.get(file);
230 let old_hash = old_blobs.get(file);
231 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(file, h));
232 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(file, h));
233
234 Self::diff_for_file_preloaded(file, old_hash, new_hash, &old_bytes, &new_bytes)
235 }
236
237 fn diff_for_file_preloaded(
239 file: &Path,
240 old_hash: Option<&ObjectHash>,
241 new_hash: Option<&ObjectHash>,
242 old_bytes: &[u8],
243 new_bytes: &[u8],
244 ) -> String {
245 let mut out = String::new();
246
247 let _ = writeln!(out, "diff --git a/{} b/{}", file.display(), file.display());
249
250 if old_hash.is_none() {
251 let _ = writeln!(out, "new file mode 100644");
252 } else if new_hash.is_none() {
253 let _ = writeln!(out, "deleted file mode 100644");
254 }
255
256 let old_index = Self::short_hash(old_hash);
257 let new_index = Self::short_hash(new_hash);
258 let _ = writeln!(out, "index {old_index}..{new_index} 100644");
259
260 match (
261 std::str::from_utf8(old_bytes),
262 std::str::from_utf8(new_bytes),
263 ) {
264 (Ok(old_text), Ok(new_text)) => {
265 let (old_pref, new_pref) = if old_text.is_empty() {
266 ("/dev/null".to_string(), format!("b/{}", file.display()))
267 } else if new_text.is_empty() {
268 (format!("a/{}", file.display()), "/dev/null".to_string())
269 } else {
270 (
271 format!("a/{}", file.display()),
272 format!("b/{}", file.display()),
273 )
274 };
275
276 let _ = writeln!(out, "--- {old_pref}");
277 let _ = writeln!(out, "+++ {new_pref}");
278
279 let unified = Self::compute_unified_diff(old_text, new_text, 3);
280 out.push_str(&unified);
281 }
282 _ => {
283 let _ = writeln!(out, "Binary files differ");
284 }
285 }
286
287 out
288 }
289
    /// Builds the unified-diff hunk text (everything after the `---`/`+++`
    /// headers) for `old_text` vs `new_text`, with `context` equal lines of
    /// surrounding context per hunk. Changes separated by more than
    /// `2 * context` equal lines are split into separate hunks.
    fn compute_unified_diff(old_text: &str, new_text: &str, context: usize) -> String {
        let diff = TextDiff::configure()
            .algorithm(Algorithm::Myers)
            .diff_lines(old_text, new_text);

        // Rough pre-sizing to limit reallocations while appending hunks.
        let mut out = String::with_capacity(((old_text.len() + new_text.len()) / 16).max(4096));

        // Sliding window of the last `context` equal lines seen before a
        // hunk starts; becomes the hunk's leading context.
        let mut prefix_ctx: VecDeque<EditLine> = VecDeque::with_capacity(context);
        // Lines of the hunk currently being accumulated.
        let mut cur_hunk: Vec<EditLine> = Vec::new();
        // Run of equal lines seen while inside a hunk: either joins two
        // nearby changes, or (if long enough) terminates the hunk.
        let mut eq_run: Vec<EditLine> = Vec::new();
        let mut in_hunk = false;

        // Highest line numbers already emitted, used by flush_hunk_to_out
        // for zero-length @@ ranges.
        let mut last_old_seen = 0usize;
        let mut last_new_seen = 0usize;
        // 1-based cursors into the old and new files.
        let mut old_line_no = 1usize;
        let mut new_line_no = 1usize;

        for change in diff.iter_all_changes() {
            // Strip the trailing newline; hunk rendering adds its own.
            let line = change.value().trim_end_matches(['\r', '\n']);
            match change.tag() {
                ChangeTag::Equal => {
                    let entry = EditLine::Context(Some(old_line_no), Some(new_line_no), line);
                    old_line_no += 1;
                    new_line_no += 1;
                    if in_hunk {
                        eq_run.push(entry);
                        // A gap wider than 2 * context cannot be shared by
                        // the trailing context of this hunk and the leading
                        // context of the next: close the hunk here.
                        if eq_run.len() > context * 2 {
                            Self::flush_hunk_to_out(
                                &mut out,
                                &mut cur_hunk,
                                &mut eq_run,
                                &mut prefix_ctx,
                                context,
                                &mut last_old_seen,
                                &mut last_new_seen,
                            );
                            in_hunk = false;
                        }
                    } else {
                        // Between hunks: keep only the newest `context`
                        // equal lines as potential leading context.
                        if prefix_ctx.len() == context {
                            prefix_ctx.pop_front();
                        }
                        prefix_ctx.push_back(entry);
                    }
                }
                ChangeTag::Delete => {
                    let entry = EditLine::Delete(old_line_no, line);
                    old_line_no += 1;
                    if !in_hunk {
                        // Start a hunk with the buffered leading context.
                        cur_hunk.extend(prefix_ctx.iter().copied());
                        prefix_ctx.clear();
                        in_hunk = true;
                    }
                    // A short equal run inside the hunk stays inline.
                    if !eq_run.is_empty() {
                        cur_hunk.append(&mut eq_run);
                    }
                    cur_hunk.push(entry);
                }
                ChangeTag::Insert => {
                    let entry = EditLine::Insert(new_line_no, line);
                    new_line_no += 1;
                    if !in_hunk {
                        // Start a hunk with the buffered leading context.
                        cur_hunk.extend(prefix_ctx.iter().copied());
                        prefix_ctx.clear();
                        in_hunk = true;
                    }
                    // A short equal run inside the hunk stays inline.
                    if !eq_run.is_empty() {
                        cur_hunk.append(&mut eq_run);
                    }
                    cur_hunk.push(entry);
                }
            }
        }

        // Flush the final, still-open hunk (if any).
        if in_hunk {
            Self::flush_hunk_to_out(
                &mut out,
                &mut cur_hunk,
                &mut eq_run,
                &mut prefix_ctx,
                context,
                &mut last_old_seen,
                &mut last_new_seen,
            );
        }

        out
    }
383
384 fn flush_hunk_to_out<'a>(
386 out: &mut String,
387 cur_hunk: &mut Vec<EditLine<'a>>,
388 eq_run: &mut Vec<EditLine<'a>>,
389 prefix_ctx: &mut VecDeque<EditLine<'a>>,
390 context: usize,
391 last_old_seen: &mut usize,
392 last_new_seen: &mut usize,
393 ) {
394 let trail_to_take = eq_run.len().min(context);
396 for entry in eq_run.iter().take(trail_to_take) {
397 cur_hunk.push(*entry);
398 }
399
400 let mut old_first: Option<usize> = None;
402 let mut old_count: usize = 0;
403 let mut new_first: Option<usize> = None;
404 let mut new_count: usize = 0;
405
406 for e in cur_hunk.iter() {
407 match *e {
408 EditLine::Context(o, n, _) => {
409 if let Some(o) = o {
410 if old_first.is_none() {
411 old_first = Some(o);
412 }
413 old_count += 1;
414 }
415 if let Some(n) = n {
416 if new_first.is_none() {
417 new_first = Some(n);
418 }
419 new_count += 1;
420 }
421 }
422 EditLine::Delete(o, _) => {
423 if old_first.is_none() {
424 old_first = Some(o);
425 }
426 old_count += 1;
427 }
428 EditLine::Insert(n, _) => {
429 if new_first.is_none() {
430 new_first = Some(n);
431 }
432 new_count += 1;
433 }
434 }
435 }
436
437 if old_count == 0 && new_count == 0 {
438 cur_hunk.clear();
439 eq_run.clear();
440 return;
441 }
442
443 let old_start = old_first.unwrap_or(*last_old_seen + 1);
444 let new_start = new_first.unwrap_or(*last_new_seen + 1);
445
446 let _ = writeln!(
447 out,
448 "@@ -{old_start},{old_count} +{new_start},{new_count} @@"
449 );
450
451 for &e in cur_hunk.iter() {
453 match e {
454 EditLine::Context(o, n, txt) => {
455 let _ = writeln!(out, " {txt}");
456 if let Some(o) = o {
457 *last_old_seen = (*last_old_seen).max(o);
458 }
459 if let Some(n) = n {
460 *last_new_seen = (*last_new_seen).max(n);
461 }
462 }
463 EditLine::Delete(o, txt) => {
464 let _ = writeln!(out, "-{txt}");
465 *last_old_seen = (*last_old_seen).max(o);
466 }
467 EditLine::Insert(n, txt) => {
468 let _ = writeln!(out, "+{txt}");
469 *last_new_seen = (*last_new_seen).max(n);
470 }
471 }
472 }
473
474 prefix_ctx.clear();
476 if context > 0 {
477 let keep_start = eq_run.len().saturating_sub(context);
478 for entry in eq_run.iter().skip(keep_start) {
479 prefix_ctx.push_back(*entry);
480 }
481 }
482
483 cur_hunk.clear();
484 eq_run.clear();
485 }
486}
487
/// Computes the line-level [`DiffOperation`] sequence that transforms
/// `old_lines` into `new_lines` (Myers algorithm, 1-based line numbers).
pub fn compute_diff(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
    Diff::compute_line_operations(old_lines, new_lines)
}
492
#[cfg(test)]
mod tests {
    use std::{collections::HashMap, fs, path::PathBuf, process::Command};

    use tempfile::tempdir;

    use super::{Diff, DiffOperation, compute_diff};
    use crate::hash::{HashKind, ObjectHash, set_hash_kind_for_test};

    /// Runs our differ on a single logical file with the given contents,
    /// backed by an in-memory blob store; returns the diff text plus the
    /// old and new content hashes.
    fn run_diff(
        logical_path: &str,
        old_bytes: &[u8],
        new_bytes: &[u8],
    ) -> (String, ObjectHash, ObjectHash) {
        let file = PathBuf::from(logical_path);
        let old_hash = ObjectHash::new(old_bytes);
        let new_hash = ObjectHash::new(new_bytes);

        // In-memory stand-in for the object database.
        let mut blob_store: HashMap<ObjectHash, Vec<u8>> = HashMap::new();
        blob_store.insert(old_hash, old_bytes.to_vec());
        blob_store.insert(new_hash, new_bytes.to_vec());

        let mut old_map = HashMap::new();
        let mut new_map = HashMap::new();
        old_map.insert(file.clone(), old_hash);
        new_map.insert(file.clone(), new_hash);

        let reader = |_: &PathBuf, h: &ObjectHash| -> Vec<u8> {
            blob_store.get(h).cloned().unwrap_or_default()
        };

        let diff = Diff::diff_for_file_string(&file, &old_map, &new_map, &reader);
        (diff, old_hash, new_hash)
    }

    /// First seven characters of a hash's hex form, mirroring
    /// `Diff::SHORT_HASH_LEN`.
    fn short_hash(hash: &ObjectHash) -> String {
        hash.to_string().chars().take(7).collect()
    }

    /// Runs `git diff --no-index` on the two contents in a temp dir and
    /// rewrites its headers (paths, index line, hunk headers) so the
    /// output is comparable with ours. Returns `None` when git is
    /// unavailable or produced no output.
    fn normalized_git_diff(
        logical_path: &str,
        old_bytes: &[u8],
        new_bytes: &[u8],
        old_hash: &ObjectHash,
        new_hash: &ObjectHash,
    ) -> Option<String> {
        let temp_dir = tempdir().ok()?;
        let old_file = temp_dir.path().join("old.txt");
        let new_file = temp_dir.path().join("new.txt");

        fs::write(&old_file, old_bytes).ok()?;
        fs::write(&new_file, new_bytes).ok()?;

        let output = Command::new("git")
            .current_dir(temp_dir.path())
            .args(["diff", "--no-index", "--unified=3", "old.txt", "new.txt"])
            .output()
            .ok()?;

        let stdout = String::from_utf8_lossy(&output.stdout);
        if stdout.is_empty() {
            return None;
        }

        let short_old = short_hash(old_hash);
        let short_new = short_hash(new_hash);

        let mut normalized = Vec::new();
        for line in stdout.lines() {
            let rewritten = if line.starts_with("diff --git ") {
                format!("diff --git a/{logical_path} b/{logical_path}")
            } else if line.starts_with("index ") {
                format!("index {short_old}..{short_new} 100644")
            } else if line.starts_with("--- ") {
                format!("--- a/{logical_path}")
            } else if line.starts_with("+++ ") {
                format!("+++ b/{logical_path}")
            } else if line.starts_with("@@") {
                // Drop the section heading git appends after the second @@.
                match line.rfind("@@") {
                    Some(pos) if pos + 2 <= line.len() => line[..pos + 2].to_string(),
                    _ => line.to_string(),
                }
            } else {
                line.to_string()
            };
            normalized.push(rewritten);
        }

        Some(normalized.join("\n") + "\n")
    }

    /// Smoke test: headers and +/- lines for a small textual change.
    #[test]
    fn unified_diff_basic_changes() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old = b"a\nb\nc\n" as &[u8];
        let new = b"a\nB\nc\nd\n" as &[u8];
        let (diff, _, _) = run_diff("foo.txt", old, new);

        assert!(diff.contains("diff --git a/foo.txt b/foo.txt"));
        assert!(diff.contains("index "));
        assert!(diff.contains("--- a/foo.txt"));
        assert!(diff.contains("+++ b/foo.txt"));
        assert!(diff.contains("@@"));
        assert!(diff.contains("-b"));
        assert!(diff.contains("+B"));
        assert!(diff.contains("+d"));
    }

    /// Non-UTF-8 content on either side must fall back to the binary note.
    #[test]
    fn binary_files_detection() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old_bytes = vec![0u8, 159, 146, 150];
        let new_bytes = vec![0xFF, 0x00, 0x01];
        let (diff, _, _) = run_diff("bin.dat", &old_bytes, &new_bytes);
        assert!(diff.contains("Binary files differ"));
    }

    /// Compares the sets of +/- lines against real `git diff` output for
    /// a checked-in fixture pair (hunk headers are not compared).
    #[test]
    fn diff_matches_git_for_fixture() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let base: PathBuf = [env!("CARGO_MANIFEST_DIR"), "tests", "diff"]
            .iter()
            .collect();
        let old_bytes = fs::read(base.join("old.txt")).expect("read old.txt");
        let new_bytes = fs::read(base.join("new.txt")).expect("read new.txt");

        let (diff_output, old_hash, new_hash) = run_diff("fixture.txt", &old_bytes, &new_bytes);
        let git_output =
            normalized_git_diff("fixture.txt", &old_bytes, &new_bytes, &old_hash, &new_hash)
                .expect("git diff output");

        // Collects the lines starting with the given marker character.
        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        let ours_del = collect(&diff_output, '-');
        let ours_ins = collect(&diff_output, '+');
        let git_del = collect(&git_output, '-');
        let git_ins = collect(&git_output, '+');

        use std::collections::HashSet;
        let ours_del_set: HashSet<_> = ours_del.iter().collect();
        let git_del_set: HashSet<_> = git_del.iter().collect();
        let ours_ins_set: HashSet<_> = ours_ins.iter().collect();
        let git_ins_set: HashSet<_> = git_ins.iter().collect();

        assert_eq!(
            ours_del_set, git_del_set,
            "deleted lines differ from git output"
        );
        assert_eq!(
            ours_ins_set, git_ins_set,
            "inserted lines differ from git output"
        );
    }

    /// Same +/- comparison against git, on a synthetic 5000-line file with
    /// scattered edits, an insertion, and an appended tail.
    #[test]
    fn diff_matches_git_for_large_change() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old_lines: Vec<String> = (0..5_000).map(|i| format!("line {i}")).collect();
        let mut new_lines = old_lines.clone();
        for idx in [10, 499, 1_234, 3_210, 4_999] {
            new_lines[idx] = format!("updated line {idx}");
        }
        new_lines.insert(2_500, "inserted middle line".into());
        new_lines.push("new tail line".into());

        let old_text = old_lines.join("\n") + "\n";
        let new_text = new_lines.join("\n") + "\n";

        let (diff_output, old_hash, new_hash) = run_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
        );
        let git_output = normalized_git_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
            &old_hash,
            &new_hash,
        )
        .expect("git diff output");

        // Collects the lines starting with the given marker character.
        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        use std::collections::HashSet;
        let ours_del: HashSet<_> = collect(&diff_output, '-').into_iter().collect();
        let ours_ins: HashSet<_> = collect(&diff_output, '+').into_iter().collect();
        let git_del: HashSet<_> = collect(&git_output, '-').into_iter().collect();
        let git_ins: HashSet<_> = collect(&git_output, '+').into_iter().collect();
        assert_eq!(ours_del, git_del, "deleted lines differ from git output");
        assert_eq!(ours_ins, git_ins, "inserted lines differ from git output");
    }

    /// Pins the exact operation sequence (with 1-based line numbers) for a
    /// small change, including the trailing insert.
    #[test]
    fn compute_diff_operations_basic_mapping() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old_lines = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let new_lines = vec![
            "a".to_string(),
            "B".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];

        let ops = compute_diff(&old_lines, &new_lines);

        let expected = vec![
            DiffOperation::Equal {
                old_line: 1,
                new_line: 1,
            },
            DiffOperation::Delete { line: 2 },
            DiffOperation::Insert {
                line: 2,
                content: "B".to_string(),
            },
            DiffOperation::Equal {
                old_line: 3,
                new_line: 3,
            },
            DiffOperation::Insert {
                line: 4,
                content: "d".to_string(),
            },
        ];

        assert_eq!(ops, expected);
    }
}