1use std::ops::Range;
9
10use bstr::{BStr, ByteSlice};
11use gix_object::tree::{EntryKind, EntryMode};
12
13use crate::{
14 Rewrites,
15 blob::{DiffLineStats, ResourceKind, platform::prepare_diff::Operation},
16 rewrites::{CopySource, Outcome, Tracker, tracker::visit::SourceKind},
17 tree::visit::{Action, ChangeId, Relation},
18};
19
20#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
22pub enum ChangeKind {
23 Deletion,
25 Modification,
27 Addition,
29}
30
31pub trait Change: Clone {
33 fn id(&self) -> &gix_hash::oid;
38 fn relation(&self) -> Option<Relation>;
49 fn kind(&self) -> ChangeKind;
51 fn entry_mode(&self) -> EntryMode;
53 fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
55}
56
57pub(crate) struct Item<T> {
59 change: T,
61 path: Range<usize>,
63 emitted: bool,
65}
66
67impl<T: Change> Item<T> {
68 fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
69 backing[self.path.clone()].as_ref()
70 }
71 fn entry_mode_compatible(&self, other: EntryMode) -> bool {
72 use EntryKind::*;
73 matches!(
74 (other.kind(), self.change.entry_mode().kind()),
75 (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
76 )
77 }
78
79 fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
80 self.entry_mode_compatible(dest_item_mode)
81 && match kind {
82 visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
83 visit::SourceKind::Copy => {
84 matches!(self.change.kind(), ChangeKind::Modification)
85 }
86 }
87 }
88}
89
90pub mod visit {
92 use bstr::BStr;
93 use gix_object::tree::EntryMode;
94
95 use crate::blob::DiffLineStats;
96
97 #[derive(Debug, Clone, PartialEq, PartialOrd)]
99 pub struct Source<'a, T> {
100 pub entry_mode: EntryMode,
102 pub id: gix_hash::ObjectId,
104 pub kind: SourceKind,
106 pub location: &'a BStr,
108 pub change: &'a T,
110 pub diff: Option<DiffLineStats>,
112 }
113
114 #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
116 pub enum SourceKind {
117 Rename,
119 Copy,
121 }
122
123 #[derive(Debug, Clone)]
125 pub struct Destination<'a, T: Clone> {
126 pub change: T,
128 pub location: &'a BStr,
130 }
131}
132
133pub mod emit {
135 #[derive(Debug, thiserror::Error)]
137 #[allow(missing_docs)]
138 pub enum Error {
139 #[error("Could not find blob for similarity checking")]
140 FindExistingBlob(#[from] gix_object::find::existing_object::Error),
141 #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
142 GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
143 #[error(transparent)]
144 SetResource(#[from] crate::blob::platform::set_resource::Error),
145 #[error(transparent)]
146 PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
147 }
148}
149
150impl<T: Change> Tracker<T> {
152 pub fn new(rewrites: Rewrites) -> Self {
154 Tracker {
155 items: vec![],
156 path_backing: vec![],
157 rewrites,
158 child_renames: Default::default(),
159 }
160 }
161}
162
163impl<T: Change> Tracker<T> {
165 pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
167 let change_kind = change.kind();
168 if let (None, ChangeKind::Modification) = (self.rewrites.copies, change_kind) {
169 return Some(change);
170 }
171
172 let entry_kind = change.entry_mode().kind();
173 if entry_kind == EntryKind::Commit {
174 return Some(change);
175 }
176 let relation = change
177 .relation()
178 .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
179 if let (None, EntryKind::Tree) = (relation, entry_kind) {
180 return Some(change);
181 }
182
183 let start = self.path_backing.len();
184 self.path_backing.extend_from_slice(location);
185 let path = start..self.path_backing.len();
186
187 self.items.push(Item {
188 path,
189 change,
190 emitted: false,
191 });
192 None
193 }
194
195 pub fn emit<PushSourceTreeFn, E>(
218 &mut self,
219 mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
220 diff_cache: &mut crate::blob::Platform,
221 objects: &impl gix_object::FindObjectOrHeader,
222 mut push_source_tree: PushSourceTreeFn,
223 ) -> Result<Outcome, emit::Error>
224 where
225 PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
226 E: std::error::Error + Send + Sync + 'static,
227 {
228 fn is_parent(change: &impl Change) -> bool {
229 matches!(change.relation(), Some(Relation::Parent(_)))
230 }
231 diff_cache.options.skip_internal_diff_if_external_is_configured = false;
232
233 fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
236 a.change
237 .id()
238 .cmp(b.change.id())
239 .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
240 }
241
242 let has_work = {
244 let (mut num_deletions, mut num_additions, mut num_modifications) = (0, 0, 0);
245 let mut has_work = false;
246 for change in &self.items {
247 match change.change.kind() {
248 ChangeKind::Deletion => {
249 num_deletions += 1;
250 }
251 ChangeKind::Modification => {
252 num_modifications += 1;
254 }
255 ChangeKind::Addition => num_additions += 1,
256 }
257 if (num_deletions != 0 && num_additions != 0)
258 || (self.rewrites.copies.is_some() && num_modifications + num_additions > 1)
259 {
260 has_work = true;
261 break;
262 }
263 }
264 has_work
265 };
266
267 let mut out = Outcome {
268 options: self.rewrites,
269 ..Default::default()
270 };
271 if has_work {
272 self.items.sort_by(by_id_and_location);
273
274 self.match_pairs_of_kind(
278 visit::SourceKind::Rename,
279 &mut cb,
280 None, &mut out,
282 diff_cache,
283 objects,
284 Some(is_parent),
285 )?;
286
287 self.match_pairs_of_kind(
288 visit::SourceKind::Rename,
289 &mut cb,
290 self.rewrites.percentage,
291 &mut out,
292 diff_cache,
293 objects,
294 None,
295 )?;
296
297 self.match_renamed_directories(&mut cb)?;
298
299 if let Some(copies) = self.rewrites.copies {
300 self.match_pairs_of_kind(
301 visit::SourceKind::Copy,
302 &mut cb,
303 copies.percentage,
304 &mut out,
305 diff_cache,
306 objects,
307 None,
308 )?;
309
310 match copies.source {
311 CopySource::FromSetOfModifiedFiles => {}
312 CopySource::FromSetOfModifiedFilesAndAllSources => {
313 push_source_tree(&mut |change, location| {
314 if self.try_push_change(change, location).is_none() {
315 self.items.last_mut().expect("just pushed").emitted = true;
317 }
318 })
319 .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
320 self.items.sort_by(by_id_and_location);
321
322 self.match_pairs_of_kind(
323 visit::SourceKind::Copy,
324 &mut cb,
325 copies.percentage,
326 &mut out,
327 diff_cache,
328 objects,
329 None,
330 )?;
331 }
332 }
333 }
334 }
335
336 self.items
337 .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
338 for item in self.items.drain(..).filter(|item| !item.emitted) {
339 if cb(
340 visit::Destination {
341 location: item.location(&self.path_backing),
342 change: item.change,
343 },
344 None,
345 )
346 .is_break()
347 {
348 break;
349 }
350 }
351 Ok(out)
352 }
353}
354
355impl<T: Change> Tracker<T> {
356 #[allow(clippy::too_many_arguments)]
357 fn match_pairs_of_kind(
358 &mut self,
359 kind: visit::SourceKind,
360 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
361 percentage: Option<f32>,
362 out: &mut Outcome,
363 diff_cache: &mut crate::blob::Platform,
364 objects: &impl gix_object::FindObjectOrHeader,
365 filter: Option<fn(&T) -> bool>,
366 ) -> Result<(), emit::Error> {
367 let needs_second_pass = !needs_exact_match(percentage);
369
370 if self
374 .match_pairs(cb, None , kind, out, diff_cache, objects, filter)?
375 .is_break()
376 {
377 return Ok(());
378 }
379 if needs_second_pass {
380 let is_limited = if self.rewrites.limit == 0 {
381 false
382 } else {
383 let (num_src, num_dst) =
384 estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
385 let permutations = num_src * num_dst;
386 if permutations > self.rewrites.limit {
387 match kind {
388 visit::SourceKind::Rename => {
389 out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
390 }
391 visit::SourceKind::Copy => {
392 out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
393 }
394 }
395 true
396 } else {
397 false
398 }
399 };
400 if !is_limited {
401 let _ = self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
402 }
403 }
404 Ok(())
405 }
406
407 #[allow(clippy::too_many_arguments)]
408 fn match_pairs(
409 &mut self,
410 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
411 percentage: Option<f32>,
412 kind: visit::SourceKind,
413 stats: &mut Outcome,
414 diff_cache: &mut crate::blob::Platform,
415 objects: &impl gix_object::FindObjectOrHeader,
416 filter: Option<fn(&T) -> bool>,
417 ) -> Result<Action, emit::Error> {
418 let mut dest_ofs = 0;
419 let mut num_checks = 0;
420 let max_checks = {
421 let limit = self.rewrites.limit.saturating_pow(2);
422 if self.items.len() < 100_000 { 0 } else { limit }
426 };
427
428 while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
429 (!item.emitted
430 && matches!(item.change.kind(), ChangeKind::Addition)
431 && filter.map_or_else(
432 || {
433 self.rewrites.track_empty
434 || matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
437 || {
438 let id = item.change.id();
439 id != gix_hash::ObjectId::empty_blob(id.kind())
440 }
441 },
442 |f| f(&item.change),
443 ))
444 .then_some((idx, item))
445 }) {
446 dest_idx += dest_ofs;
447 dest_ofs = dest_idx + 1;
448 self.items[dest_idx].location(&self.path_backing);
449 let src = find_match(
450 &self.items,
451 dest,
452 dest_idx,
453 percentage,
454 kind,
455 stats,
456 objects,
457 diff_cache,
458 &self.path_backing,
459 &mut num_checks,
460 )?
461 .map(|(src_idx, src, diff)| {
462 let (id, entry_mode) = src.change.id_and_entry_mode();
463 let id = id.to_owned();
464 let location = src.location(&self.path_backing);
465 (
466 visit::Source {
467 entry_mode,
468 id,
469 kind,
470 location,
471 change: &src.change,
472 diff,
473 },
474 src_idx,
475 )
476 });
477 if max_checks != 0 && num_checks > max_checks {
478 gix_trace::warn!(
479 "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})"
480 );
481 return Ok(std::ops::ControlFlow::Break(()));
482 }
483 let Some((src, src_idx)) = src else {
484 continue;
485 };
486 let location = dest.location(&self.path_backing);
487 let change = dest.change.clone();
488 let dest = visit::Destination { change, location };
489 let relations = if percentage.is_none() {
490 src.change.relation().zip(dest.change.relation())
491 } else {
492 None
493 };
494 let res = cb(dest, Some(src));
495
496 self.items[dest_idx].emitted = true;
497 self.items[src_idx].emitted = true;
498
499 if res.is_break() {
500 return Ok(std::ops::ControlFlow::Break(()));
501 }
502
503 match relations {
504 Some((Relation::Parent(src), Relation::Parent(dst))) => {
505 let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
506 if res.is_break() {
507 return Ok(std::ops::ControlFlow::Break(()));
508 }
509 }
510 Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
511 self.child_renames.insert((src, dst));
512 }
513 _ => {}
514 }
515 }
516 Ok(std::ops::ControlFlow::Continue(()))
517 }
518
519 fn emit_child_renames_matching_identity(
523 &mut self,
524 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
525 kind: visit::SourceKind,
526 src_parent_id: ChangeId,
527 dst_parent_id: ChangeId,
528 ) -> Result<Action, emit::Error> {
529 debug_assert_ne!(
530 src_parent_id, dst_parent_id,
531 "src and destination directories must be distinct"
532 );
533 let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
534 for item in self.items.iter_mut().filter(|item| !item.emitted) {
535 match item.change.relation() {
536 Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
537 src_items.push((item.change.id().to_owned(), item));
538 }
539 Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
540 dst_items.push((item.change.id().to_owned(), item));
541 }
542 _ => continue,
543 }
544 }
545
546 for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
547 if src_id == dst_id
550 && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
551 {
552 let entry_mode = src_item.change.entry_mode();
553 let location = src_item.location(&self.path_backing);
554 let src = visit::Source {
555 entry_mode,
556 id: src_id,
557 kind,
558 location,
559 change: &src_item.change,
560 diff: None,
561 };
562 let location = dst_item.location(&self.path_backing);
563 let change = dst_item.change.clone();
564 let dst = visit::Destination { change, location };
565 let res = cb(dst, Some(src));
566
567 src_item.emitted = true;
568 dst_item.emitted = true;
569
570 if res.is_break() {
571 return Ok(res);
572 }
573 } else {
574 gix_trace::warn!(
575 "Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be"
576 );
577 break;
578 }
579 }
580 Ok(std::ops::ControlFlow::Continue(()))
581 }
582
583 fn match_renamed_directories(
589 &mut self,
590 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
591 ) -> Result<(), emit::Error> {
592 fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
593 items.iter().position(|i| {
594 !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
595 })
596 }
597 for (deleted_child_id, added_child_id) in &self.child_renames {
598 let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
599 continue;
600 };
601 let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
602 continue;
605 };
606
607 let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
608 let entry_mode = src_item.change.entry_mode();
609 let location = src_item.location(&self.path_backing);
610 let src = visit::Source {
611 entry_mode,
612 id: src_item.change.id().to_owned(),
613 kind: SourceKind::Rename,
614 location,
615 change: &src_item.change,
616 diff: None,
617 };
618 let location = dst_item.location(&self.path_backing);
619 let change = dst_item.change.clone();
620 let dst = visit::Destination { change, location };
621 let res = cb(dst, Some(src));
622
623 self.items[src_idx].emitted = true;
624 self.items[dst_idx].emitted = true;
625
626 if res.is_break() {
627 return Ok(());
628 }
629 }
630 Ok(())
631 }
632}
633
634fn filename(path: &BStr) -> &BStr {
635 path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
636}
637
638fn estimate_involved_items(
640 items: impl IntoIterator<Item = (bool, ChangeKind)>,
641 kind: visit::SourceKind,
642) -> (usize, usize) {
643 items
644 .into_iter()
645 .filter(|(emitted, _)| match kind {
646 visit::SourceKind::Rename => !*emitted,
647 visit::SourceKind::Copy => true,
648 })
649 .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
650 match change_kind {
651 ChangeKind::Addition => {
652 if kind == visit::SourceKind::Rename || !emitted {
653 dest += 1;
654 }
655 }
656 ChangeKind::Deletion => {
657 if kind == visit::SourceKind::Rename {
658 src += 1;
659 }
660 }
661 ChangeKind::Modification => {
662 if kind == visit::SourceKind::Copy {
663 src += 1;
664 }
665 }
666 }
667 (src, dest)
668 })
669}
670
671fn needs_exact_match(percentage: Option<f32>) -> bool {
672 percentage.is_none_or(|p| p >= 1.0)
673}
674
675type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
677
678#[allow(clippy::too_many_arguments)]
686fn find_match<'a, T: Change>(
687 items: &'a [Item<T>],
688 item: &Item<T>,
689 item_idx: usize,
690 percentage: Option<f32>,
691 kind: visit::SourceKind,
692 stats: &mut Outcome,
693 objects: &impl gix_object::FindObjectOrHeader,
694 diff_cache: &mut crate::blob::Platform,
695 path_backing: &[u8],
696 num_checks: &mut usize,
697) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
698 let (item_id, item_mode) = item.change.id_and_entry_mode();
699 if needs_exact_match(percentage) || item_mode.is_link() {
700 let first_idx = items.partition_point(|a| a.change.id() < item_id);
701 let range = items.get(first_idx..).map(|slice| {
702 let end = slice
703 .iter()
704 .position(|a| a.change.id() != item_id)
705 .map_or(items.len(), |idx| first_idx + idx);
706 first_idx..end
707 });
708 let range = match range {
709 Some(range) => range,
710 None => return Ok(None),
711 };
712 if range.is_empty() {
713 return Ok(None);
714 }
715 let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
716 src_idx += range.start;
717 *num_checks += 1;
718 (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
719 });
720 if let Some(src) = res {
721 return Ok(Some(src));
722 }
723 } else if item_mode.is_blob() {
724 let mut has_new = false;
725 let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
726
727 for (can_idx, src) in items
728 .iter()
729 .enumerate()
730 .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
731 {
732 if !has_new {
733 diff_cache.set_resource(
734 item_id.to_owned(),
735 item_mode.kind(),
736 item.location(path_backing),
737 ResourceKind::NewOrDestination,
738 objects,
739 )?;
740 has_new = true;
741 }
742 let (src_id, src_mode) = src.change.id_and_entry_mode();
743 diff_cache.set_resource(
744 src_id.to_owned(),
745 src_mode.kind(),
746 src.location(path_backing),
747 ResourceKind::OldOrSource,
748 objects,
749 )?;
750 let prep = diff_cache.prepare_diff()?;
751 stats.num_similarity_checks += 1;
752 *num_checks += 1;
753 match prep.operation {
754 Operation::InternalDiff { algorithm } => {
755 let tokens = crate::blob::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
756 let diff = crate::blob::Diff::compute(algorithm, &tokens);
757 let removed_bytes = diff::removed_bytes(&diff, &tokens);
758 let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
759 let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
760 let similarity = (old_data_len - removed_bytes) as f32 / old_data_len.max(new_data_len) as f32;
761 if similarity >= percentage {
762 return Ok(Some((
763 can_idx,
764 src,
765 DiffLineStats {
766 removals: diff.count_removals(),
767 insertions: diff.count_additions(),
768 before: tokens.before.len(),
769 after: tokens.after.len(),
770 similarity,
771 }
772 .into(),
773 )));
774 }
775 }
776 Operation::ExternalCommand { .. } => {
777 unreachable!("we have disabled this possibility with an option")
778 }
779 Operation::SourceOrDestinationIsBinary => {
780 }
782 }
783 }
784 }
785 Ok(None)
786}
787
788mod diff {
789 pub fn removed_bytes(diff: &crate::blob::Diff, input: &crate::blob::InternedInput<&[u8]>) -> usize {
790 diff.hunks()
791 .map(|hunk| {
792 input.before[hunk.before.start as usize..hunk.before.end as usize]
793 .iter()
794 .map(|token| input.interner[*token].len())
795 .sum::<usize>()
796 })
797 .sum()
798 }
799}
800
801#[cfg(test)]
802mod estimate_involved_items {
803 use super::estimate_involved_items;
804 use crate::rewrites::tracker::{ChangeKind, visit::SourceKind};
805
806 #[test]
807 fn renames_count_unemitted_as_sources_and_destinations() {
808 let items = [
809 (false, ChangeKind::Addition),
810 (true, ChangeKind::Deletion),
811 (true, ChangeKind::Deletion),
812 ];
813 assert_eq!(
814 estimate_involved_items(items, SourceKind::Rename),
815 (0, 1),
816 "here we only have one eligible source, hence nothing to do"
817 );
818 assert_eq!(
819 estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
820 (2, 1),
821 "now we have more possibilities as renames count un-emitted deletions as source"
822 );
823 }
824
825 #[test]
826 fn copies_do_not_count_additions_as_sources() {
827 let items = [
828 (false, ChangeKind::Addition),
829 (true, ChangeKind::Addition),
830 (true, ChangeKind::Deletion),
831 ];
832 assert_eq!(
833 estimate_involved_items(items, SourceKind::Copy),
834 (0, 1),
835 "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
836 deletions don't count"
837 );
838 }
839
840 #[test]
841 fn copies_count_modifications_as_sources() {
842 let items = [
843 (false, ChangeKind::Addition),
844 (true, ChangeKind::Modification),
845 (false, ChangeKind::Modification),
846 ];
847 assert_eq!(
848 estimate_involved_items(items, SourceKind::Copy),
849 (2, 1),
850 "any modifications is a valid source, emitted or not"
851 );
852 }
853}