1use crate::hash::SHA1;
2use path_absolutize::Absolutize;
3use similar::{Algorithm, ChangeTag, TextDiff};
4use std::collections::{HashMap, HashSet, VecDeque};
5use std::fmt::Write;
6use std::path::{Path, PathBuf};
7
/// One file's rendered diff output.
#[derive(Debug, Clone)]
pub struct DiffItem {
    /// File path, converted with `to_string_lossy` (non-UTF-8 bytes replaced).
    pub path: String,
    /// Rendered diff text: a git-style unified diff, a `Binary files differ`
    /// header, or a `<LargeFile>…</LargeFile>` marker line for oversized files.
    pub data: String,
}
18
19pub struct Diff;
20
/// A single line-level edit produced by [`compute_diff`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DiffOperation {
    /// Insert `content` at 1-based `line` in the new file.
    Insert { line: usize, content: String },
    /// Delete the 1-based `line` of the old file (content not retained).
    Delete { line: usize },
    /// Line unchanged; present at `old_line` (old file) and `new_line` (new file).
    Equal { old_line: usize, new_line: usize },
}
28
/// Internal per-line record used while assembling unified-diff hunks.
/// Borrows the line text from the input, so it is cheap to copy.
#[derive(Debug, Clone, Copy)]
enum EditLine<'a> {
    /// Unchanged line with optional old/new 1-based line numbers.
    /// (Both are `Some` where constructed in this file; the `Option` lets
    /// the hunk writer treat a missing side gracefully.)
    Context(Option<usize>, Option<usize>, &'a str),
    /// Deleted line: old-file line number and its text.
    Delete(usize, &'a str),
    /// Inserted line: new-file line number and its text.
    Insert(usize, &'a str),
}
38
39impl Diff {
40 fn compute_line_operations(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
41 if old_lines.is_empty() && new_lines.is_empty() {
42 return Vec::new();
43 }
44
45 let old_refs: Vec<&str> = old_lines.iter().map(|s| s.as_str()).collect();
46 let new_refs: Vec<&str> = new_lines.iter().map(|s| s.as_str()).collect();
47
48 let diff = TextDiff::configure()
49 .algorithm(Algorithm::Myers)
50 .diff_slices(&old_refs, &new_refs);
51
52 let mut operations = Vec::with_capacity(old_lines.len() + new_lines.len());
53 let mut old_line_no = 1usize;
54 let mut new_line_no = 1usize;
55
56 for change in diff.iter_all_changes() {
57 match change.tag() {
58 ChangeTag::Equal => {
59 operations.push(DiffOperation::Equal {
60 old_line: old_line_no,
61 new_line: new_line_no,
62 });
63 old_line_no += 1;
64 new_line_no += 1;
65 }
66 ChangeTag::Delete => {
67 operations.push(DiffOperation::Delete { line: old_line_no });
68 old_line_no += 1;
69 }
70 ChangeTag::Insert => {
71 operations.push(DiffOperation::Insert {
72 line: new_line_no,
73 content: change.value().to_string(),
74 });
75 new_line_no += 1;
76 }
77 }
78 }
79
80 operations
81 }
82
83 const MAX_DIFF_LINES: usize = 10_000; const LARGE_FILE_MARKER: &'static str = "<LargeFile>";
85 const LARGE_FILE_END: &'static str = "</LargeFile>";
86 const SHORT_HASH_LEN: usize = 7;
87
88 pub fn diff<F>(
90 old_blobs: Vec<(PathBuf, SHA1)>,
91 new_blobs: Vec<(PathBuf, SHA1)>,
92 filter: Vec<PathBuf>,
93 read_content: F,
94 ) -> Vec<DiffItem>
95 where
96 F: Fn(&PathBuf, &SHA1) -> Vec<u8>,
97 {
98 let (processed_files, old_blobs_map, new_blobs_map) =
99 Self::prepare_diff_data(old_blobs, new_blobs, &filter);
100
101 let mut diff_results: Vec<DiffItem> = Vec::with_capacity(processed_files.len());
102 for file in processed_files {
103 let old_hash = old_blobs_map.get(&file);
105 let new_hash = new_blobs_map.get(&file);
106 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(&file, h));
107 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(&file, h));
108
109 if let Some(large_file_marker) =
110 Self::is_large_file_bytes(&file, &old_bytes, &new_bytes)
111 {
112 diff_results.push(DiffItem {
113 path: file.to_string_lossy().to_string(),
114 data: large_file_marker,
115 });
116 } else {
117 let diff = Self::diff_for_file_preloaded(
118 &file, old_hash, new_hash, &old_bytes, &new_bytes,
119 );
120 diff_results.push(DiffItem {
121 path: file.to_string_lossy().to_string(),
122 data: diff,
123 });
124 }
125 }
126
127 diff_results
128 }
129
130 fn is_large_file_bytes(file: &Path, old_bytes: &[u8], new_bytes: &[u8]) -> Option<String> {
132 let old_lines = String::from_utf8_lossy(old_bytes).lines().count();
133 let new_lines = String::from_utf8_lossy(new_bytes).lines().count();
134 let total_lines = old_lines + new_lines;
135 if total_lines > Self::MAX_DIFF_LINES {
136 Some(format!(
137 "{}{}:{}:{}{}\n",
138 Self::LARGE_FILE_MARKER,
139 file.display(),
140 total_lines,
141 Self::MAX_DIFF_LINES,
142 Self::LARGE_FILE_END
143 ))
144 } else {
145 None
146 }
147 }
148
149 fn prepare_diff_data(
151 old_blobs: Vec<(PathBuf, SHA1)>,
152 new_blobs: Vec<(PathBuf, SHA1)>,
153 filter: &[PathBuf],
154 ) -> (Vec<PathBuf>, HashMap<PathBuf, SHA1>, HashMap<PathBuf, SHA1>) {
155 let old_blobs_map: HashMap<PathBuf, SHA1> = old_blobs.into_iter().collect();
156 let new_blobs_map: HashMap<PathBuf, SHA1> = new_blobs.into_iter().collect();
157
158 let union_files: HashSet<PathBuf> = old_blobs_map
160 .keys()
161 .chain(new_blobs_map.keys())
162 .cloned()
163 .collect();
164
165 let processed_files: Vec<PathBuf> = union_files
167 .into_iter()
168 .filter(|file| Self::should_process(file, filter, &old_blobs_map, &new_blobs_map))
169 .collect();
170
171 (processed_files, old_blobs_map, new_blobs_map)
172 }
173
174 fn should_process(
175 file: &PathBuf,
176 filter: &[PathBuf],
177 old_blobs: &HashMap<PathBuf, SHA1>,
178 new_blobs: &HashMap<PathBuf, SHA1>,
179 ) -> bool {
180 if !filter.is_empty()
181 && !filter
182 .iter()
183 .any(|path| Self::sub_of(file, path).unwrap_or(false))
184 {
185 return false;
186 }
187
188 old_blobs.get(file) != new_blobs.get(file)
189 }
190
191 fn sub_of(path: &PathBuf, parent: &PathBuf) -> Result<bool, std::io::Error> {
192 let path_abs: PathBuf = path.absolutize()?.to_path_buf();
193 let parent_abs: PathBuf = parent.absolutize()?.to_path_buf();
194 Ok(path_abs.starts_with(parent_abs))
195 }
196
197 fn short_hash(hash: Option<&SHA1>) -> String {
198 hash.map(|h| {
199 let hex = h.to_string();
200 let take = Self::SHORT_HASH_LEN.min(hex.len());
201 hex[..take].to_string()
202 })
203 .unwrap_or_else(|| "0".repeat(Self::SHORT_HASH_LEN))
204 }
205
206 pub fn diff_for_file_string(
208 file: &PathBuf,
209 old_blobs: &HashMap<PathBuf, SHA1>,
210 new_blobs: &HashMap<PathBuf, SHA1>,
211 read_content: &dyn Fn(&PathBuf, &SHA1) -> Vec<u8>,
212 ) -> String {
213 let new_hash = new_blobs.get(file);
214 let old_hash = old_blobs.get(file);
215 let old_bytes = old_hash.map_or_else(Vec::new, |h| read_content(file, h));
216 let new_bytes = new_hash.map_or_else(Vec::new, |h| read_content(file, h));
217
218 Self::diff_for_file_preloaded(file, old_hash, new_hash, &old_bytes, &new_bytes)
219 }
220
221 fn diff_for_file_preloaded(
223 file: &Path,
224 old_hash: Option<&SHA1>,
225 new_hash: Option<&SHA1>,
226 old_bytes: &[u8],
227 new_bytes: &[u8],
228 ) -> String {
229 let mut out = String::new();
230
231 let _ = writeln!(out, "diff --git a/{} b/{}", file.display(), file.display());
233
234 if old_hash.is_none() {
235 let _ = writeln!(out, "new file mode 100644");
236 } else if new_hash.is_none() {
237 let _ = writeln!(out, "deleted file mode 100644");
238 }
239
240 let old_index = Self::short_hash(old_hash);
241 let new_index = Self::short_hash(new_hash);
242 let _ = writeln!(out, "index {old_index}..{new_index} 100644");
243
244 match (
245 std::str::from_utf8(old_bytes),
246 std::str::from_utf8(new_bytes),
247 ) {
248 (Ok(old_text), Ok(new_text)) => {
249 let (old_pref, new_pref) = if old_text.is_empty() {
250 ("/dev/null".to_string(), format!("b/{}", file.display()))
251 } else if new_text.is_empty() {
252 (format!("a/{}", file.display()), "/dev/null".to_string())
253 } else {
254 (
255 format!("a/{}", file.display()),
256 format!("b/{}", file.display()),
257 )
258 };
259
260 let _ = writeln!(out, "--- {old_pref}");
261 let _ = writeln!(out, "+++ {new_pref}");
262
263 let unified = Self::compute_unified_diff(old_text, new_text, 3);
264 out.push_str(&unified);
265 }
266 _ => {
267 let _ = writeln!(out, "Binary files differ");
268 }
269 }
270
271 out
272 }
273
    /// Build the unified-diff hunk body (the `@@ … @@` sections) for two
    /// text blobs, with `context` unchanged lines around each hunk.
    ///
    /// Single pass over the Myers diff: changes open/extend a hunk, and a
    /// run of more than `2 * context` equal lines closes it (leaving room
    /// for this hunk's trailing context and the next hunk's leading one).
    fn compute_unified_diff(old_text: &str, new_text: &str, context: usize) -> String {
        let diff = TextDiff::configure()
            .algorithm(Algorithm::Myers)
            .diff_lines(old_text, new_text);

        // Rough output-size estimate to cut down on reallocations.
        let mut out = String::with_capacity(((old_text.len() + new_text.len()) / 16).max(4096));

        // prefix_ctx: sliding window of the last `context` equal lines seen
        //             while *outside* a hunk (becomes the next hunk's lead-in).
        // cur_hunk:   lines accumulated for the hunk currently being built.
        // eq_run:     consecutive equal lines seen while *inside* a hunk.
        let mut prefix_ctx: VecDeque<EditLine> = VecDeque::with_capacity(context);
        let mut cur_hunk: Vec<EditLine> = Vec::new();
        let mut eq_run: Vec<EditLine> = Vec::new(); let mut in_hunk = false;

        // Highest line numbers already emitted (used by the hunk writer) and
        // the 1-based cursors for each side.
        let mut last_old_seen = 0usize;
        let mut last_new_seen = 0usize;
        let mut old_line_no = 1usize;
        let mut new_line_no = 1usize;

        for change in diff.iter_all_changes() {
            // Strip trailing newline/CR; the hunk writer adds its own '\n'.
            let line = change.value().trim_end_matches(['\r', '\n']);
            match change.tag() {
                ChangeTag::Equal => {
                    let entry = EditLine::Context(Some(old_line_no), Some(new_line_no), line);
                    old_line_no += 1;
                    new_line_no += 1;
                    if in_hunk {
                        eq_run.push(entry);
                        // Too many equal lines in a row: the hunk is over.
                        if eq_run.len() > context * 2 {
                            Self::flush_hunk_to_out(
                                &mut out,
                                &mut cur_hunk,
                                &mut eq_run,
                                &mut prefix_ctx,
                                context,
                                &mut last_old_seen,
                                &mut last_new_seen,
                            );
                            in_hunk = false;
                        }
                    } else {
                        // Outside a hunk keep only the last `context` lines.
                        if prefix_ctx.len() == context {
                            prefix_ctx.pop_front();
                        }
                        prefix_ctx.push_back(entry);
                    }
                }
                ChangeTag::Delete => {
                    let entry = EditLine::Delete(old_line_no, line);
                    old_line_no += 1;
                    if !in_hunk {
                        // First change after equal lines: open a hunk with
                        // the buffered leading context.
                        cur_hunk.extend(prefix_ctx.iter().copied());
                        prefix_ctx.clear();
                        in_hunk = true;
                    }
                    if !eq_run.is_empty() {
                        // Short equal run between changes stays in the hunk.
                        cur_hunk.append(&mut eq_run);
                    }
                    cur_hunk.push(entry);
                }
                ChangeTag::Insert => {
                    let entry = EditLine::Insert(new_line_no, line);
                    new_line_no += 1;
                    if !in_hunk {
                        cur_hunk.extend(prefix_ctx.iter().copied());
                        prefix_ctx.clear();
                        in_hunk = true;
                    }
                    if !eq_run.is_empty() {
                        cur_hunk.append(&mut eq_run);
                    }
                    cur_hunk.push(entry);
                }
            }
        }

        // Emit the final, still-open hunk (if any).
        if in_hunk {
            Self::flush_hunk_to_out(
                &mut out,
                &mut cur_hunk,
                &mut eq_run,
                &mut prefix_ctx,
                context,
                &mut last_old_seen,
                &mut last_new_seen,
            );
        }

        out
    }
367
368 fn flush_hunk_to_out<'a>(
370 out: &mut String,
371 cur_hunk: &mut Vec<EditLine<'a>>,
372 eq_run: &mut Vec<EditLine<'a>>,
373 prefix_ctx: &mut VecDeque<EditLine<'a>>,
374 context: usize,
375 last_old_seen: &mut usize,
376 last_new_seen: &mut usize,
377 ) {
378 let trail_to_take = eq_run.len().min(context);
380 for entry in eq_run.iter().take(trail_to_take) {
381 cur_hunk.push(*entry);
382 }
383
384 let mut old_first: Option<usize> = None;
386 let mut old_count: usize = 0;
387 let mut new_first: Option<usize> = None;
388 let mut new_count: usize = 0;
389
390 for e in cur_hunk.iter() {
391 match *e {
392 EditLine::Context(o, n, _) => {
393 if let Some(o) = o {
394 if old_first.is_none() {
395 old_first = Some(o);
396 }
397 old_count += 1;
398 }
399 if let Some(n) = n {
400 if new_first.is_none() {
401 new_first = Some(n);
402 }
403 new_count += 1;
404 }
405 }
406 EditLine::Delete(o, _) => {
407 if old_first.is_none() {
408 old_first = Some(o);
409 }
410 old_count += 1;
411 }
412 EditLine::Insert(n, _) => {
413 if new_first.is_none() {
414 new_first = Some(n);
415 }
416 new_count += 1;
417 }
418 }
419 }
420
421 if old_count == 0 && new_count == 0 {
422 cur_hunk.clear();
423 eq_run.clear();
424 return;
425 }
426
427 let old_start = old_first.unwrap_or(*last_old_seen + 1);
428 let new_start = new_first.unwrap_or(*last_new_seen + 1);
429
430 let _ = writeln!(
431 out,
432 "@@ -{},{} +{},{} @@",
433 old_start, old_count, new_start, new_count
434 );
435
436 for &e in cur_hunk.iter() {
438 match e {
439 EditLine::Context(o, n, txt) => {
440 let _ = writeln!(out, " {txt}");
441 if let Some(o) = o {
442 *last_old_seen = (*last_old_seen).max(o);
443 }
444 if let Some(n) = n {
445 *last_new_seen = (*last_new_seen).max(n);
446 }
447 }
448 EditLine::Delete(o, txt) => {
449 let _ = writeln!(out, "-{txt}");
450 *last_old_seen = (*last_old_seen).max(o);
451 }
452 EditLine::Insert(n, txt) => {
453 let _ = writeln!(out, "+{txt}");
454 *last_new_seen = (*last_new_seen).max(n);
455 }
456 }
457 }
458
459 prefix_ctx.clear();
461 if context > 0 {
462 let keep_start = eq_run.len().saturating_sub(context);
463 for entry in eq_run.iter().skip(keep_start) {
464 prefix_ctx.push_back(*entry);
465 }
466 }
467
468 cur_hunk.clear();
469 eq_run.clear();
470 }
471}
472
/// Public entry point for line-level diffing: returns the list of
/// [`DiffOperation`]s that transforms `old_lines` into `new_lines`.
pub fn compute_diff(old_lines: &[String], new_lines: &[String]) -> Vec<DiffOperation> {
    Diff::compute_line_operations(old_lines, new_lines)
}
477
#[cfg(test)]
mod tests {
    use super::{Diff, DiffOperation, compute_diff};
    use crate::hash::SHA1;
    use std::collections::HashMap;
    use std::fs;
    use std::path::PathBuf;
    use std::process::Command;
    use tempfile::tempdir;

    /// Diff `old_bytes` → `new_bytes` under `logical_path` through an
    /// in-memory blob store; returns the diff text plus both hashes.
    fn run_diff(logical_path: &str, old_bytes: &[u8], new_bytes: &[u8]) -> (String, SHA1, SHA1) {
        let file = PathBuf::from(logical_path);
        let old_hash = SHA1::new(old_bytes);
        let new_hash = SHA1::new(new_bytes);

        // Hash → content store standing in for real object storage.
        let mut blob_store: HashMap<SHA1, Vec<u8>> = HashMap::new();
        blob_store.insert(old_hash, old_bytes.to_vec());
        blob_store.insert(new_hash, new_bytes.to_vec());

        let mut old_map = HashMap::new();
        let mut new_map = HashMap::new();
        old_map.insert(file.clone(), old_hash);
        new_map.insert(file.clone(), new_hash);

        let reader =
            |_: &PathBuf, h: &SHA1| -> Vec<u8> { blob_store.get(h).cloned().unwrap_or_default() };

        let diff = Diff::diff_for_file_string(&file, &old_map, &new_map, &reader);
        (diff, old_hash, new_hash)
    }

    /// First 7 hex digits of a hash, mirroring `Diff`'s index line format.
    fn short_hash(hash: &SHA1) -> String {
        hash.to_string().chars().take(7).collect()
    }

    /// Run the system `git diff --no-index` on the same contents and
    /// rewrite its headers (paths, index line, hunk trailers) so the
    /// output is comparable with ours. Returns `None` when git is
    /// unavailable or produced no output.
    fn normalized_git_diff(
        logical_path: &str,
        old_bytes: &[u8],
        new_bytes: &[u8],
        old_hash: &SHA1,
        new_hash: &SHA1,
    ) -> Option<String> {
        let temp_dir = tempdir().ok()?;
        let old_file = temp_dir.path().join("old.txt");
        let new_file = temp_dir.path().join("new.txt");

        fs::write(&old_file, old_bytes).ok()?;
        fs::write(&new_file, new_bytes).ok()?;

        let output = Command::new("git")
            .current_dir(temp_dir.path())
            .args(["diff", "--no-index", "--unified=3", "old.txt", "new.txt"])
            .output()
            .ok()?;

        let stdout = String::from_utf8_lossy(&output.stdout);
        if stdout.is_empty() {
            return None;
        }

        let short_old = short_hash(old_hash);
        let short_new = short_hash(new_hash);

        // Rewrite headers to use the logical path and our short hashes;
        // truncate hunk headers after the closing "@@" to drop the
        // function-context trailer git appends.
        let mut normalized = Vec::new();
        for line in stdout.lines() {
            let rewritten = if line.starts_with("diff --git ") {
                format!("diff --git a/{logical_path} b/{logical_path}")
            } else if line.starts_with("index ") {
                format!("index {short_old}..{short_new} 100644")
            } else if line.starts_with("--- ") {
                format!("--- a/{logical_path}")
            } else if line.starts_with("+++ ") {
                format!("+++ b/{logical_path}")
            } else if line.starts_with("@@") {
                match line.rfind("@@") {
                    Some(pos) if pos + 2 <= line.len() => line[..pos + 2].to_string(),
                    _ => line.to_string(),
                }
            } else {
                line.to_string()
            };
            normalized.push(rewritten);
        }

        Some(normalized.join("\n") + "\n")
    }

    /// Smoke test: headers and +/- lines all present for a simple edit.
    #[test]
    fn unified_diff_basic_changes() {
        let old = b"a\nb\nc\n" as &[u8];
        let new = b"a\nB\nc\nd\n" as &[u8];
        let (diff, _, _) = run_diff("foo.txt", old, new);

        assert!(diff.contains("diff --git a/foo.txt b/foo.txt"));
        assert!(diff.contains("index "));
        assert!(diff.contains("--- a/foo.txt"));
        assert!(diff.contains("+++ b/foo.txt"));
        assert!(diff.contains("@@"));
        assert!(diff.contains("-b"));
        assert!(diff.contains("+B"));
        assert!(diff.contains("+d"));
    }

    /// Non-UTF-8 content on either side must short-circuit to the
    /// binary marker rather than a textual diff.
    #[test]
    fn binary_files_detection() {
        let old_bytes = vec![0u8, 159, 146, 150];
        let new_bytes = vec![0xFF, 0x00, 0x01];
        let (diff, _, _) = run_diff("bin.dat", &old_bytes, &new_bytes);
        assert!(diff.contains("Binary files differ"));
    }

    /// Compare our +/- line sets against real `git diff` on a fixture
    /// pair checked into tests/diff. Requires git on PATH.
    #[test]
    fn diff_matches_git_for_fixture() {
        let base: PathBuf = [env!("CARGO_MANIFEST_DIR"), "tests", "diff"]
            .iter()
            .collect();
        let old_bytes = fs::read(base.join("old.txt")).expect("read old.txt");
        let new_bytes = fs::read(base.join("new.txt")).expect("read new.txt");

        let (diff_output, old_hash, new_hash) = run_diff("fixture.txt", &old_bytes, &new_bytes);
        let git_output =
            normalized_git_diff("fixture.txt", &old_bytes, &new_bytes, &old_hash, &new_hash)
                .expect("git diff output");

        // Gather lines starting with the given sign (includes the
        // ---/+++ headers, which were normalized identically above).
        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        let ours_del = collect(&diff_output, '-');
        let ours_ins = collect(&diff_output, '+');
        let git_del = collect(&git_output, '-');
        let git_ins = collect(&git_output, '+');

        // Set comparison: hunk boundaries may differ slightly, but the
        // changed lines themselves must agree.
        use std::collections::HashSet;
        let ours_del_set: HashSet<_> = ours_del.iter().collect();
        let git_del_set: HashSet<_> = git_del.iter().collect();
        let ours_ins_set: HashSet<_> = ours_ins.iter().collect();
        let git_ins_set: HashSet<_> = git_ins.iter().collect();

        assert_eq!(
            ours_del_set, git_del_set,
            "deleted lines differ from git output"
        );
        assert_eq!(
            ours_ins_set, git_ins_set,
            "inserted lines differ from git output"
        );
    }

    /// Same git-agreement check on a synthetic 5000-line file with
    /// scattered edits, a middle insertion, and an appended tail.
    #[test]
    fn diff_matches_git_for_large_change() {
        let old_lines: Vec<String> = (0..5_000).map(|i| format!("line {i}")).collect();
        let mut new_lines = old_lines.clone();
        for idx in [10, 499, 1_234, 3_210, 4_999] {
            new_lines[idx] = format!("updated line {idx}");
        }
        new_lines.insert(2_500, "inserted middle line".into());
        new_lines.push("new tail line".into());

        let old_text = old_lines.join("\n") + "\n";
        let new_text = new_lines.join("\n") + "\n";

        let (diff_output, old_hash, new_hash) = run_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
        );
        let git_output = normalized_git_diff(
            "large_fixture.txt",
            old_text.as_bytes(),
            new_text.as_bytes(),
            &old_hash,
            &new_hash,
        )
        .expect("git diff output");

        fn collect(s: &str, prefix: char) -> Vec<String> {
            s.lines()
                .filter(|l| l.starts_with(prefix))
                .map(|l| l.to_string())
                .collect()
        }
        use std::collections::HashSet;
        let ours_del: HashSet<_> = collect(&diff_output, '-').into_iter().collect();
        let ours_ins: HashSet<_> = collect(&diff_output, '+').into_iter().collect();
        let git_del: HashSet<_> = collect(&git_output, '-').into_iter().collect();
        let git_ins: HashSet<_> = collect(&git_output, '+').into_iter().collect();
        assert_eq!(ours_del, git_del, "deleted lines differ from git output");
        assert_eq!(ours_ins, git_ins, "inserted lines differ from git output");
    }

    /// The operation list form: exact sequence of Equal/Delete/Insert
    /// with correct per-side line numbers.
    #[test]
    fn compute_diff_operations_basic_mapping() {
        let old_lines = vec!["a".to_string(), "b".to_string(), "c".to_string()];
        let new_lines = vec![
            "a".to_string(),
            "B".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];

        let ops = compute_diff(&old_lines, &new_lines);

        let expected = vec![
            DiffOperation::Equal {
                old_line: 1,
                new_line: 1,
            },
            DiffOperation::Delete { line: 2 },
            DiffOperation::Insert {
                line: 2,
                content: "B".to_string(),
            },
            DiffOperation::Equal {
                old_line: 3,
                new_line: 3,
            },
            DiffOperation::Insert {
                line: 4,
                content: "d".to_string(),
            },
        ];

        assert_eq!(ops, expected);
    }
}