1use crate::hash::ObjectHash;
2use path_absolutize::Absolutize;
3use similar::{Algorithm, ChangeTag, TextDiff};
4use std::collections::{HashMap, HashSet, VecDeque};
5use std::fmt::Write;
6use std::path::{Path, PathBuf};
7
/// One entry of a diff run: the rendered diff text for a single file.
#[derive(Debug, Clone)]
pub struct DiffItem {
    /// The file path, converted via `to_string_lossy`.
    pub path: String,
    /// The unified-diff text (or a `<LargeFile>` marker) for this file.
    pub data: String,
}
18
/// Namespace type for diff computation; all functionality lives in associated functions.
pub struct Diff;
20
/// A single line-level edit produced by [`compute_diff`].
///
/// Line numbers are 1-based. `Insert` uses new-file numbering, `Delete` uses
/// old-file numbering, and `Equal` carries the line's position on both sides.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DiffOperation {
    /// `content` was inserted as line `line` of the new file.
    Insert { line: usize, content: String },
    /// Line `line` of the old file was removed.
    Delete { line: usize },
    /// The line is unchanged; it sits at `old_line` / `new_line` on each side.
    Equal { old_line: usize, new_line: usize },
}
28
/// Internal representation of one hunk line while assembling unified output.
#[derive(Debug, Clone, Copy)]
enum EditLine<'a> {
    /// Unchanged line with its (old, new) 1-based line numbers, when known.
    Context(Option<usize>, Option<usize>, &'a str),
    /// Removed line with its old-file line number.
    Delete(usize, &'a str),
    /// Added line with its new-file line number.
    Insert(usize, &'a str),
}
38
39impl Diff {
40 fn compute_line_operations(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
41 if old_lines.is_empty() && new_lines.is_empty() {
42 return Vec::new();
43 }
44
45 let old_refs: Vec<&str> = old_lines.iter().map(|s| s.as_str()).collect();
46 let new_refs: Vec<&str> = new_lines.iter().map(|s| s.as_str()).collect();
47
48 let diff = TextDiff::configure()
49 .algorithm(Algorithm::Myers)
50 .diff_slices(&old_refs, &new_refs);
51
52 let mut operations = Vec::with_capacity(old_lines.len() + new_lines.len());
53 let mut old_line_no = 1usize;
54 let mut new_line_no = 1usize;
55
56 for change in diff.iter_all_changes() {
57 match change.tag() {
58 ChangeTag::Equal => {
59 operations.push(DiffOperation::Equal {
60 old_line: old_line_no,
61 new_line: new_line_no,
62 });
63 old_line_no += 1;
64 new_line_no += 1;
65 }
66 ChangeTag::Delete => {
67 operations.push(DiffOperation::Delete { line: old_line_no });
68 old_line_no += 1;
69 }
70 ChangeTag::Insert => {
71 operations.push(DiffOperation::Insert {
72 line: new_line_no,
73 content: change.value().to_string(),
74 });
75 new_line_no += 1;
76 }
77 }
78 }
79
80 operations
81 }
82
83 const MAX_DIFF_LINES: usize = 10_000; const LARGE_FILE_MARKER: &'static str = "<LargeFile>";
85 const LARGE_FILE_END: &'static str = "</LargeFile>";
86 const SHORT_HASH_LEN: usize = 7;
87
88 pub fn diff<F>(
90 old_blobs: Vec<(PathBuf, ObjectHash)>,
91 new_blobs: Vec<(PathBuf, ObjectHash)>,
92 filter: Vec<PathBuf>,
93 read_content: F,
94 ) -> Vec<DiffItem>
95 where
96 F: Fn(&PathBuf, &ObjectHash) -> Vec<u8>,
97 {
98 let (processed_files, old_blobs_map, new_blobs_map) =
99 Self::prepare_diff_data(old_blobs, new_blobs, &filter);
100
101 let mut diff_results: Vec<DiffItem> = Vec::with_capacity(processed_files.len());
102 for file in processed_files {
103 let old_hash = old_blobs_map.get(&file);
105 let new_hash = new_blobs_map.get(&file);
106 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(&file, h));
107 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(&file, h));
108
109 if let Some(large_file_marker) =
110 Self::is_large_file_bytes(&file, &old_bytes, &new_bytes)
111 {
112 diff_results.push(DiffItem {
113 path: file.to_string_lossy().to_string(),
114 data: large_file_marker,
115 });
116 } else {
117 let diff = Self::diff_for_file_preloaded(
118 &file, old_hash, new_hash, &old_bytes, &new_bytes,
119 );
120 diff_results.push(DiffItem {
121 path: file.to_string_lossy().to_string(),
122 data: diff,
123 });
124 }
125 }
126
127 diff_results
128 }
129
130 fn is_large_file_bytes(file: &Path, old_bytes: &[u8], new_bytes: &[u8]) -> Option<String> {
132 let old_lines = String::from_utf8_lossy(old_bytes).lines().count();
133 let new_lines = String::from_utf8_lossy(new_bytes).lines().count();
134 let total_lines = old_lines + new_lines;
135 if total_lines > Self::MAX_DIFF_LINES {
136 Some(format!(
137 "{}{}:{}:{}{}\n",
138 Self::LARGE_FILE_MARKER,
139 file.display(),
140 total_lines,
141 Self::MAX_DIFF_LINES,
142 Self::LARGE_FILE_END
143 ))
144 } else {
145 None
146 }
147 }
148
149 fn prepare_diff_data(
151 old_blobs: Vec<(PathBuf, ObjectHash)>,
152 new_blobs: Vec<(PathBuf, ObjectHash)>,
153 filter: &[PathBuf],
154 ) -> (
155 Vec<PathBuf>,
156 HashMap<PathBuf, ObjectHash>,
157 HashMap<PathBuf, ObjectHash>,
158 ) {
159 let old_blobs_map: HashMap<PathBuf, ObjectHash> = old_blobs.into_iter().collect();
160 let new_blobs_map: HashMap<PathBuf, ObjectHash> = new_blobs.into_iter().collect();
161 let union_files: HashSet<PathBuf> = old_blobs_map
163 .keys()
164 .chain(new_blobs_map.keys())
165 .cloned()
166 .collect();
167
168 let processed_files: Vec<PathBuf> = union_files
170 .into_iter()
171 .filter(|file| Self::should_process(file, filter, &old_blobs_map, &new_blobs_map))
172 .collect();
173
174 (processed_files, old_blobs_map, new_blobs_map)
175 }
176
177 fn should_process(
178 file: &PathBuf,
179 filter: &[PathBuf],
180 old_blobs: &HashMap<PathBuf, ObjectHash>,
181 new_blobs: &HashMap<PathBuf, ObjectHash>,
182 ) -> bool {
183 if !filter.is_empty()
184 && !filter
185 .iter()
186 .any(|path| Self::sub_of(file, path).unwrap_or(false))
187 {
188 return false;
189 }
190
191 old_blobs.get(file) != new_blobs.get(file)
192 }
193
194 fn sub_of(path: &PathBuf, parent: &PathBuf) -> Result<bool, std::io::Error> {
195 let path_abs: PathBuf = path.absolutize()?.to_path_buf();
196 let parent_abs: PathBuf = parent.absolutize()?.to_path_buf();
197 Ok(path_abs.starts_with(parent_abs))
198 }
199
200 fn short_hash(hash: Option<&ObjectHash>) -> String {
201 hash.map(|h| {
202 let hex = h.to_string();
203 let take = Self::SHORT_HASH_LEN.min(hex.len());
204 hex[..take].to_string()
205 })
206 .unwrap_or_else(|| "0".repeat(Self::SHORT_HASH_LEN))
207 }
208
209 pub fn diff_for_file_string(
211 file: &PathBuf,
212 old_blobs: &HashMap<PathBuf, ObjectHash>,
213 new_blobs: &HashMap<PathBuf, ObjectHash>,
214 read_content: &dyn Fn(&PathBuf, &ObjectHash) -> Vec<u8>,
215 ) -> String {
216 let new_hash = new_blobs.get(file);
217 let old_hash = old_blobs.get(file);
218 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(file, h));
219 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(file, h));
220
221 Self::diff_for_file_preloaded(file, old_hash, new_hash, &old_bytes, &new_bytes)
222 }
223
224 fn diff_for_file_preloaded(
226 file: &Path,
227 old_hash: Option<&ObjectHash>,
228 new_hash: Option<&ObjectHash>,
229 old_bytes: &[u8],
230 new_bytes: &[u8],
231 ) -> String {
232 let mut out = String::new();
233
234 let _ = writeln!(out, "diff --git a/{} b/{}", file.display(), file.display());
236
237 if old_hash.is_none() {
238 let _ = writeln!(out, "new file mode 100644");
239 } else if new_hash.is_none() {
240 let _ = writeln!(out, "deleted file mode 100644");
241 }
242
243 let old_index = Self::short_hash(old_hash);
244 let new_index = Self::short_hash(new_hash);
245 let _ = writeln!(out, "index {old_index}..{new_index} 100644");
246
247 match (
248 std::str::from_utf8(old_bytes),
249 std::str::from_utf8(new_bytes),
250 ) {
251 (Ok(old_text), Ok(new_text)) => {
252 let (old_pref, new_pref) = if old_text.is_empty() {
253 ("/dev/null".to_string(), format!("b/{}", file.display()))
254 } else if new_text.is_empty() {
255 (format!("a/{}", file.display()), "/dev/null".to_string())
256 } else {
257 (
258 format!("a/{}", file.display()),
259 format!("b/{}", file.display()),
260 )
261 };
262
263 let _ = writeln!(out, "--- {old_pref}");
264 let _ = writeln!(out, "+++ {new_pref}");
265
266 let unified = Self::compute_unified_diff(old_text, new_text, 3);
267 out.push_str(&unified);
268 }
269 _ => {
270 let _ = writeln!(out, "Binary files differ");
271 }
272 }
273
274 out
275 }
276
277 fn compute_unified_diff(old_text: &str, new_text: &str, context: usize) -> String {
279 let diff = TextDiff::configure()
281 .algorithm(Algorithm::Myers)
282 .diff_lines(old_text, new_text);
283
284 let mut out = String::with_capacity(((old_text.len() + new_text.len()) / 16).max(4096));
286
287 let mut prefix_ctx: VecDeque<EditLine> = VecDeque::with_capacity(context);
289 let mut cur_hunk: Vec<EditLine> = Vec::new();
290 let mut eq_run: Vec<EditLine> = Vec::new(); let mut in_hunk = false;
292
293 let mut last_old_seen = 0usize;
294 let mut last_new_seen = 0usize;
295 let mut old_line_no = 1usize;
296 let mut new_line_no = 1usize;
297
298 for change in diff.iter_all_changes() {
299 let line = change.value().trim_end_matches(['\r', '\n']);
300 match change.tag() {
301 ChangeTag::Equal => {
302 let entry = EditLine::Context(Some(old_line_no), Some(new_line_no), line);
303 old_line_no += 1;
304 new_line_no += 1;
305 if in_hunk {
306 eq_run.push(entry);
307 if eq_run.len() > context * 2 {
309 Self::flush_hunk_to_out(
310 &mut out,
311 &mut cur_hunk,
312 &mut eq_run,
313 &mut prefix_ctx,
314 context,
315 &mut last_old_seen,
316 &mut last_new_seen,
317 );
318 in_hunk = false;
319 }
320 } else {
321 if prefix_ctx.len() == context {
322 prefix_ctx.pop_front();
323 }
324 prefix_ctx.push_back(entry);
325 }
326 }
327 ChangeTag::Delete => {
328 let entry = EditLine::Delete(old_line_no, line);
329 old_line_no += 1;
330 if !in_hunk {
331 cur_hunk.extend(prefix_ctx.iter().copied());
332 prefix_ctx.clear();
333 in_hunk = true;
334 }
335 if !eq_run.is_empty() {
336 cur_hunk.append(&mut eq_run);
337 }
338 cur_hunk.push(entry);
339 }
340 ChangeTag::Insert => {
341 let entry = EditLine::Insert(new_line_no, line);
342 new_line_no += 1;
343 if !in_hunk {
344 cur_hunk.extend(prefix_ctx.iter().copied());
345 prefix_ctx.clear();
346 in_hunk = true;
347 }
348 if !eq_run.is_empty() {
349 cur_hunk.append(&mut eq_run);
350 }
351 cur_hunk.push(entry);
352 }
353 }
354 }
355
356 if in_hunk {
357 Self::flush_hunk_to_out(
358 &mut out,
359 &mut cur_hunk,
360 &mut eq_run,
361 &mut prefix_ctx,
362 context,
363 &mut last_old_seen,
364 &mut last_new_seen,
365 );
366 }
367
368 out
369 }
370
371 fn flush_hunk_to_out<'a>(
373 out: &mut String,
374 cur_hunk: &mut Vec<EditLine<'a>>,
375 eq_run: &mut Vec<EditLine<'a>>,
376 prefix_ctx: &mut VecDeque<EditLine<'a>>,
377 context: usize,
378 last_old_seen: &mut usize,
379 last_new_seen: &mut usize,
380 ) {
381 let trail_to_take = eq_run.len().min(context);
383 for entry in eq_run.iter().take(trail_to_take) {
384 cur_hunk.push(*entry);
385 }
386
387 let mut old_first: Option<usize> = None;
389 let mut old_count: usize = 0;
390 let mut new_first: Option<usize> = None;
391 let mut new_count: usize = 0;
392
393 for e in cur_hunk.iter() {
394 match *e {
395 EditLine::Context(o, n, _) => {
396 if let Some(o) = o {
397 if old_first.is_none() {
398 old_first = Some(o);
399 }
400 old_count += 1;
401 }
402 if let Some(n) = n {
403 if new_first.is_none() {
404 new_first = Some(n);
405 }
406 new_count += 1;
407 }
408 }
409 EditLine::Delete(o, _) => {
410 if old_first.is_none() {
411 old_first = Some(o);
412 }
413 old_count += 1;
414 }
415 EditLine::Insert(n, _) => {
416 if new_first.is_none() {
417 new_first = Some(n);
418 }
419 new_count += 1;
420 }
421 }
422 }
423
424 if old_count == 0 && new_count == 0 {
425 cur_hunk.clear();
426 eq_run.clear();
427 return;
428 }
429
430 let old_start = old_first.unwrap_or(*last_old_seen + 1);
431 let new_start = new_first.unwrap_or(*last_new_seen + 1);
432
433 let _ = writeln!(
434 out,
435 "@@ -{},{} +{},{} @@",
436 old_start, old_count, new_start, new_count
437 );
438
439 for &e in cur_hunk.iter() {
441 match e {
442 EditLine::Context(o, n, txt) => {
443 let _ = writeln!(out, " {txt}");
444 if let Some(o) = o {
445 *last_old_seen = (*last_old_seen).max(o);
446 }
447 if let Some(n) = n {
448 *last_new_seen = (*last_new_seen).max(n);
449 }
450 }
451 EditLine::Delete(o, txt) => {
452 let _ = writeln!(out, "-{txt}");
453 *last_old_seen = (*last_old_seen).max(o);
454 }
455 EditLine::Insert(n, txt) => {
456 let _ = writeln!(out, "+{txt}");
457 *last_new_seen = (*last_new_seen).max(n);
458 }
459 }
460 }
461
462 prefix_ctx.clear();
464 if context > 0 {
465 let keep_start = eq_run.len().saturating_sub(context);
466 for entry in eq_run.iter().skip(keep_start) {
467 prefix_ctx.push_back(*entry);
468 }
469 }
470
471 cur_hunk.clear();
472 eq_run.clear();
473 }
474}
475
/// Computes line-level diff operations between two slices of lines.
///
/// Thin public wrapper over [`Diff::compute_line_operations`]; line numbers
/// in the returned [`DiffOperation`]s are 1-based on each respective side.
pub fn compute_diff(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
    Diff::compute_line_operations(old_lines, new_lines)
}
480
#[cfg(test)]
mod tests {
    use super::{Diff, DiffOperation, compute_diff};
    use crate::hash::{HashKind, ObjectHash, set_hash_kind_for_test};
    use std::collections::HashMap;
    use std::fs;
    use std::path::PathBuf;
    use std::process::Command;
    use tempfile::tempdir;

    /// Diffs `old_bytes` -> `new_bytes` for a single logical path through an
    /// in-memory blob store; returns the diff text plus both content hashes.
    fn run_diff(
        logical_path: &str,
        old_bytes: &[u8],
        new_bytes: &[u8],
    ) -> (String, ObjectHash, ObjectHash) {
        let file = PathBuf::from(logical_path);
        let old_hash = ObjectHash::new(old_bytes);
        let new_hash = ObjectHash::new(new_bytes);

        // In-memory stand-in for the object store, keyed by content hash.
        let mut blob_store: HashMap<ObjectHash, Vec<u8>> = HashMap::new();
        blob_store.insert(old_hash, old_bytes.to_vec());
        blob_store.insert(new_hash, new_bytes.to_vec());

        let mut old_map = HashMap::new();
        let mut new_map = HashMap::new();
        old_map.insert(file.clone(), old_hash);
        new_map.insert(file.clone(), new_hash);

        let reader = |_: &PathBuf, h: &ObjectHash| -> Vec<u8> {
            blob_store.get(h).cloned().unwrap_or_default()
        };

        let diff = Diff::diff_for_file_string(&file, &old_map, &new_map, &reader);
        (diff, old_hash, new_hash)
    }

    // Mirrors Diff::SHORT_HASH_LEN (7 hex chars) for header comparison.
    fn short_hash(hash: &ObjectHash) -> String {
        hash.to_string().chars().take(7).collect()
    }

    /// Runs `git diff --no-index` on the two byte buffers (written to a temp
    /// dir) and rewrites the header lines — paths, index hashes, and hunk
    /// trailers — so the output is directly comparable with ours.
    /// Returns `None` when git is unavailable or reports no difference.
    fn normalized_git_diff(
        logical_path: &str,
        old_bytes: &[u8],
        new_bytes: &[u8],
        old_hash: &ObjectHash,
        new_hash: &ObjectHash,
    ) -> Option<String> {
        let temp_dir = tempdir().ok()?;
        let old_file = temp_dir.path().join("old.txt");
        let new_file = temp_dir.path().join("new.txt");

        fs::write(&old_file, old_bytes).ok()?;
        fs::write(&new_file, new_bytes).ok()?;

        let output = Command::new("git")
            .current_dir(temp_dir.path())
            .args(["diff", "--no-index", "--unified=3", "old.txt", "new.txt"])
            .output()
            .ok()?;

        let stdout = String::from_utf8_lossy(&output.stdout);
        if stdout.is_empty() {
            return None;
        }

        let short_old = short_hash(old_hash);
        let short_new = short_hash(new_hash);

        let mut normalized = Vec::new();
        for line in stdout.lines() {
            let rewritten = if line.starts_with("diff --git ") {
                // Replace the temp-file paths with the logical path.
                format!("diff --git a/{logical_path} b/{logical_path}")
            } else if line.starts_with("index ") {
                format!("index {short_old}..{short_new} 100644")
            } else if line.starts_with("--- ") {
                format!("--- a/{logical_path}")
            } else if line.starts_with("+++ ") {
                format!("+++ b/{logical_path}")
            } else if line.starts_with("@@") {
                // Drop the function-context trailer git appends after the
                // closing "@@", keeping only the range part.
                match line.rfind("@@") {
                    Some(pos) if pos + 2 <= line.len() => line[..pos + 2].to_string(),
                    _ => line.to_string(),
                }
            } else {
                line.to_string()
            };
            normalized.push(rewritten);
        }

        Some(normalized.join("\n") + "\n")
    }

    #[test]
    fn unified_diff_basic_changes() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old = b"a\nb\nc\n" as &[u8];
        let new = b"a\nB\nc\nd\n" as &[u8];
        let (diff, _, _) = run_diff("foo.txt", old, new);

        // All standard git-style headers plus the expected +/- body lines.
        assert!(diff.contains("diff --git a/foo.txt b/foo.txt"));
        assert!(diff.contains("index "));
        assert!(diff.contains("--- a/foo.txt"));
        assert!(diff.contains("+++ b/foo.txt"));
        assert!(diff.contains("@@"));
        assert!(diff.contains("-b"));
        assert!(diff.contains("+B"));
        assert!(diff.contains("+d"));
    }

    #[test]
    fn binary_files_detection() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        // Both buffers contain invalid UTF-8, so the diff must fall back to
        // the binary notice instead of a textual hunk.
        let old_bytes = vec![0u8, 159, 146, 150];
        let new_bytes = vec![0xFF, 0x00, 0x01];
        let (diff, _, _) = run_diff("bin.dat", &old_bytes, &new_bytes);
        assert!(diff.contains("Binary files differ"));
    }

    #[test]
    fn diff_matches_git_for_fixture() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let base: PathBuf = [env!("CARGO_MANIFEST_DIR"), "tests", "diff"]
            .iter()
            .collect();
        let old_bytes = fs::read(base.join("old.txt")).expect("read old.txt");
        let new_bytes = fs::read(base.join("new.txt")).expect("read new.txt");

        let (diff_output, old_hash, new_hash) = run_diff("fixture.txt", &old_bytes, &new_bytes);
        let git_output =
            normalized_git_diff("fixture.txt", &old_bytes, &new_bytes, &old_hash, &new_hash)
                .expect("git diff output");

        // Collect the lines starting with `prefix` ('-' or '+').
        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        let ours_del = collect(&diff_output, '-');
        let ours_ins = collect(&diff_output, '+');
        let git_del = collect(&git_output, '-');
        let git_ins = collect(&git_output, '+');

        // Compare as sets: hunk grouping may differ, but the changed lines
        // themselves must agree with git.
        use std::collections::HashSet;
        let ours_del_set: HashSet<_> = ours_del.iter().collect();
        let git_del_set: HashSet<_> = git_del.iter().collect();
        let ours_ins_set: HashSet<_> = ours_ins.iter().collect();
        let git_ins_set: HashSet<_> = git_ins.iter().collect();

        assert_eq!(
            ours_del_set, git_del_set,
            "deleted lines differ from git output"
        );
        assert_eq!(
            ours_ins_set, git_ins_set,
            "inserted lines differ from git output"
        );
    }

    #[test]
    fn diff_matches_git_for_large_change() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        // Synthesize a 5000-line file with scattered edits, one insertion in
        // the middle, and one appended line.
        let old_lines: Vec<String> = (0..5_000).map(|i| format!("line {i}")).collect();
        let mut new_lines = old_lines.clone();
        for idx in [10, 499, 1_234, 3_210, 4_999] {
            new_lines[idx] = format!("updated line {idx}");
        }
        new_lines.insert(2_500, "inserted middle line".into());
        new_lines.push("new tail line".into());

        let old_text = old_lines.join("\n") + "\n";
        let new_text = new_lines.join("\n") + "\n";

        let (diff_output, old_hash, new_hash) = run_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
        );
        let git_output = normalized_git_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
            &old_hash,
            &new_hash,
        )
        .expect("git diff output");

        // Collect the lines starting with `prefix` ('-' or '+').
        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        use std::collections::HashSet;
        let ours_del: HashSet<_> = collect(&diff_output, '-').into_iter().collect();
        let ours_ins: HashSet<_> = collect(&diff_output, '+').into_iter().collect();
        let git_del: HashSet<_> = collect(&git_output, '-').into_iter().collect();
        let git_ins: HashSet<_> = collect(&git_output, '+').into_iter().collect();
        assert_eq!(ours_del, git_del, "deleted lines differ from git output");
        assert_eq!(ours_ins, git_ins, "inserted lines differ from git output");
    }

    #[test]
    fn compute_diff_operations_basic_mapping() {
        let _guard = set_hash_kind_for_test(HashKind::Sha256);
        let old_lines = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let new_lines = vec![
            "a".to_string(),
            "B".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];

        let ops = compute_diff(&old_lines, &new_lines);

        // "b" -> "B" surfaces as a delete+insert pair; "d" is a pure insert.
        let expected = vec![
            DiffOperation::Equal {
                old_line: 1,
                new_line: 1,
            },
            DiffOperation::Delete { line: 2 },
            DiffOperation::Insert {
                line: 2,
                content: "B".to_string(),
            },
            DiffOperation::Equal {
                old_line: 3,
                new_line: 3,
            },
            DiffOperation::Insert {
                line: 4,
                content: "d".to_string(),
            },
        ];

        assert_eq!(ops, expected);
    }
}