gix_diff/rewrites/
tracker.rs

1//! ### Deviation
2//!
3//! Note that the algorithm implemented here is in many ways different from what `git` does.
4//!
5//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
6//! - the set used for copy-detection is probably smaller by default.
7
8use std::ops::Range;
9
10use bstr::{BStr, ByteSlice};
11use gix_object::tree::{EntryKind, EntryMode};
12
13use crate::{
14    blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
15    rewrites::{tracker::visit::SourceKind, CopySource, Outcome, Tracker},
16    tree::visit::{Action, ChangeId, Relation},
17    Rewrites,
18};
19
20/// The kind of a change.
21#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
22pub enum ChangeKind {
23    /// The change represents the *deletion* of an item.
24    Deletion,
25    /// The change represents the *modification* of an item.
26    Modification,
27    /// The change represents the *addition* of an item.
28    Addition,
29}
30
31/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
32pub trait Change: Clone {
33    /// Return the hash of the object behind this change for identification.
34    ///
35    /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
36    /// conversions. What matters is that the IDs are comparable.
37    fn id(&self) -> &gix_hash::oid;
38    /// Return the relation that this change may have with other changes.
39    ///
40    /// It allows to associate a directory with its children that are added or removed at the same moment.
41    /// Note that this is ignored for modifications.
42    ///
43    /// If rename-tracking should always be on leaf-level, this should be set to `None` consistently.
44    /// Note that trees will never be looked up by their `id` as their children are assumed to be passed in
45    /// with the respective relationship.
46    ///
47    /// Also note that the tracker only sees what's given to it, it will not lookup trees or match paths itself.
48    fn relation(&self) -> Option<Relation>;
49    /// Return the kind of this change.
50    fn kind(&self) -> ChangeKind;
51    /// Return more information about the kind of entry affected by this change.
52    fn entry_mode(&self) -> EntryMode;
53    /// Return the id of the change along with its mode.
54    fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
55}
56
57/// A set of tracked items allows to figure out their relations by figuring out their similarity.
58pub(crate) struct Item<T> {
59    /// The underlying raw change
60    change: T,
61    /// That slice into the backing for paths.
62    path: Range<usize>,
63    /// If true, this item was already emitted, i.e. seen by the caller.
64    emitted: bool,
65}
66
67impl<T: Change> Item<T> {
68    fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
69        backing[self.path.clone()].as_ref()
70    }
71    fn entry_mode_compatible(&self, other: EntryMode) -> bool {
72        use EntryKind::*;
73        matches!(
74            (other.kind(), self.change.entry_mode().kind()),
75            (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
76        )
77    }
78
79    fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
80        self.entry_mode_compatible(dest_item_mode)
81            && match kind {
82                visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
83                visit::SourceKind::Copy => {
84                    matches!(self.change.kind(), ChangeKind::Modification)
85                }
86            }
87    }
88}
89
90/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
91pub mod visit {
92    use bstr::BStr;
93    use gix_object::tree::EntryMode;
94
95    use crate::blob::DiffLineStats;
96
97    /// The source of a rewrite, rename or copy.
98    #[derive(Debug, Clone, PartialEq, PartialOrd)]
99    pub struct Source<'a, T> {
100        /// The kind of entry.
101        pub entry_mode: EntryMode,
102        /// The hash of the state of the source as seen in the object database.
103        pub id: gix_hash::ObjectId,
104        /// Further specify what kind of source this is.
105        pub kind: SourceKind,
106        /// The repository-relative location of this entry.
107        pub location: &'a BStr,
108        /// The change that was registered as source.
109        pub change: &'a T,
110        /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
111        pub diff: Option<DiffLineStats>,
112    }
113
114    /// Further identify the kind of [Source].
115    #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
116    pub enum SourceKind {
117        /// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
118        Rename,
119        /// This is the source of a copy, as `source` was copied into `destination`.
120        Copy,
121    }
122
123    /// A change along with a location.
124    #[derive(Debug, Clone)]
125    pub struct Destination<'a, T: Clone> {
126        /// The change at the given `location`.
127        pub change: T,
128        /// The repository-relative location of this destination.
129        pub location: &'a BStr,
130    }
131}
132
133///
134pub mod emit {
135    /// The error returned by [Tracker::emit()](super::Tracker::emit()).
136    #[derive(Debug, thiserror::Error)]
137    #[allow(missing_docs)]
138    pub enum Error {
139        #[error("Could not find blob for similarity checking")]
140        FindExistingBlob(#[from] gix_object::find::existing_object::Error),
141        #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
142        GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
143        #[error(transparent)]
144        SetResource(#[from] crate::blob::platform::set_resource::Error),
145        #[error(transparent)]
146        PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
147    }
148}
149
150/// Lifecycle
151impl<T: Change> Tracker<T> {
152    /// Create a new instance with `rewrites` configuration.
153    pub fn new(rewrites: Rewrites) -> Self {
154        Tracker {
155            items: vec![],
156            path_backing: vec![],
157            rewrites,
158            child_renames: Default::default(),
159        }
160    }
161}
162
163/// build state and find matches.
164impl<T: Change> Tracker<T> {
165    /// We may refuse the push if that information isn't needed for what we have to track.
166    pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
167        let change_kind = change.kind();
168        if let (None, ChangeKind::Modification) = (self.rewrites.copies, change_kind) {
169            return Some(change);
170        }
171
172        let entry_kind = change.entry_mode().kind();
173        if entry_kind == EntryKind::Commit {
174            return Some(change);
175        }
176        let relation = change
177            .relation()
178            .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
179        if let (None, EntryKind::Tree) = (relation, entry_kind) {
180            return Some(change);
181        }
182
183        let start = self.path_backing.len();
184        self.path_backing.extend_from_slice(location);
185        let path = start..self.path_backing.len();
186
187        self.items.push(Item {
188            path,
189            change,
190            emitted: false,
191        });
192        None
193    }
194
195    /// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
196    ///
197    /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
198    /// the destination of a copy or rename, or with `None` for source if no relation to other
199    /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
200    /// Note that directories with [relation](Relation) will be emitted if there is a match, along with all their matching
201    /// child-items which are similarly bundled as rename.
202    ///
203    /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
204    /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
205    /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
206    /// through conversion steps.
207    ///
208    /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
209    /// the diff-algorithm (provided no user-algorithm is set).
210    /// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
211    /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
212    /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
213    ///
214    /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
215    /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
216    /// will panic if `change` is not a modification, and it's valid to not call `push` at all.
217    pub fn emit<PushSourceTreeFn, E>(
218        &mut self,
219        mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
220        diff_cache: &mut crate::blob::Platform,
221        objects: &impl gix_object::FindObjectOrHeader,
222        mut push_source_tree: PushSourceTreeFn,
223    ) -> Result<Outcome, emit::Error>
224    where
225        PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
226        E: std::error::Error + Send + Sync + 'static,
227    {
228        fn is_parent(change: &impl Change) -> bool {
229            matches!(change.relation(), Some(Relation::Parent(_)))
230        }
231        diff_cache.options.skip_internal_diff_if_external_is_configured = false;
232
233        // The point of this is to optimize for identity-based lookups, which should be easy to find
234        // by partitioning.
235        fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
236            a.change
237                .id()
238                .cmp(b.change.id())
239                .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
240        }
241
242        // Early abort: if there is no pair, don't do anything.
243        let has_work = {
244            let (mut num_deletions, mut num_additions, mut num_modifications) = (0, 0, 0);
245            let mut has_work = false;
246            for change in &self.items {
247                match change.change.kind() {
248                    ChangeKind::Deletion => {
249                        num_deletions += 1;
250                    }
251                    ChangeKind::Modification => {
252                        // This means we have copy-tracking enabled
253                        num_modifications += 1;
254                    }
255                    ChangeKind::Addition => num_additions += 1,
256                }
257                if (num_deletions != 0 && num_additions != 0)
258                    || (self.rewrites.copies.is_some() && num_modifications + num_additions > 1)
259                {
260                    has_work = true;
261                    break;
262                }
263            }
264            has_work
265        };
266
267        let mut out = Outcome {
268            options: self.rewrites,
269            ..Default::default()
270        };
271        if has_work {
272            self.items.sort_by(by_id_and_location);
273
274            // Rewrites by directory (without local changes) can be pruned out quickly,
275            // by finding only parents, their counterpart, and then all children can be matched by
276            // relationship ID.
277            self.match_pairs_of_kind(
278                visit::SourceKind::Rename,
279                &mut cb,
280                None, /* by identity for parents */
281                &mut out,
282                diff_cache,
283                objects,
284                Some(is_parent),
285            )?;
286
287            self.match_pairs_of_kind(
288                visit::SourceKind::Rename,
289                &mut cb,
290                self.rewrites.percentage,
291                &mut out,
292                diff_cache,
293                objects,
294                None,
295            )?;
296
297            self.match_renamed_directories(&mut cb)?;
298
299            if let Some(copies) = self.rewrites.copies {
300                self.match_pairs_of_kind(
301                    visit::SourceKind::Copy,
302                    &mut cb,
303                    copies.percentage,
304                    &mut out,
305                    diff_cache,
306                    objects,
307                    None,
308                )?;
309
310                match copies.source {
311                    CopySource::FromSetOfModifiedFiles => {}
312                    CopySource::FromSetOfModifiedFilesAndAllSources => {
313                        push_source_tree(&mut |change, location| {
314                            if self.try_push_change(change, location).is_none() {
315                                // make sure these aren't viable to be emitted anymore.
316                                self.items.last_mut().expect("just pushed").emitted = true;
317                            }
318                        })
319                        .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
320                        self.items.sort_by(by_id_and_location);
321
322                        self.match_pairs_of_kind(
323                            visit::SourceKind::Copy,
324                            &mut cb,
325                            copies.percentage,
326                            &mut out,
327                            diff_cache,
328                            objects,
329                            None,
330                        )?;
331                    }
332                }
333            }
334        }
335
336        self.items
337            .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
338        for item in self.items.drain(..).filter(|item| !item.emitted) {
339            if cb(
340                visit::Destination {
341                    location: item.location(&self.path_backing),
342                    change: item.change,
343                },
344                None,
345            )
346            .is_break()
347            {
348                break;
349            }
350        }
351        Ok(out)
352    }
353}
354
355impl<T: Change> Tracker<T> {
356    #[allow(clippy::too_many_arguments)]
357    fn match_pairs_of_kind(
358        &mut self,
359        kind: visit::SourceKind,
360        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
361        percentage: Option<f32>,
362        out: &mut Outcome,
363        diff_cache: &mut crate::blob::Platform,
364        objects: &impl gix_object::FindObjectOrHeader,
365        filter: Option<fn(&T) -> bool>,
366    ) -> Result<(), emit::Error> {
367        // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
368        let needs_second_pass = !needs_exact_match(percentage);
369
370        // https://github.com/git/git/blob/cc01bad4a9f566cf4453c7edd6b433851b0835e2/diffcore-rename.c#L350-L369
371        // We would need a hashmap to be OK to not use the limit here, otherwise the performance is too bad.
372        // This also means we don't find all renames if we hit the rename limit.
373        if self
374            .match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects, filter)?
375            .is_break()
376        {
377            return Ok(());
378        }
379        if needs_second_pass {
380            let is_limited = if self.rewrites.limit == 0 {
381                false
382            } else {
383                let (num_src, num_dst) =
384                    estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
385                let permutations = num_src * num_dst;
386                if permutations > self.rewrites.limit {
387                    match kind {
388                        visit::SourceKind::Rename => {
389                            out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
390                        }
391                        visit::SourceKind::Copy => {
392                            out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
393                        }
394                    }
395                    true
396                } else {
397                    false
398                }
399            };
400            if !is_limited {
401                let _ = self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
402            }
403        }
404        Ok(())
405    }
406
407    #[allow(clippy::too_many_arguments)]
408    fn match_pairs(
409        &mut self,
410        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
411        percentage: Option<f32>,
412        kind: visit::SourceKind,
413        stats: &mut Outcome,
414        diff_cache: &mut crate::blob::Platform,
415        objects: &impl gix_object::FindObjectOrHeader,
416        filter: Option<fn(&T) -> bool>,
417    ) -> Result<Action, emit::Error> {
418        let mut dest_ofs = 0;
419        let mut num_checks = 0;
420        let max_checks = {
421            let limit = self.rewrites.limit.saturating_pow(2);
422            // There can be trees with a lot of entries and pathological search behaviour, as they can be repeated
423            // and then have a lot of similar hashes. This also means we have to search a lot of candidates which
424            // can be too slow despite best attempts. So play it save and detect such cases 'roughly' by amount of items.
425            if self.items.len() < 100_000 {
426                0
427            } else {
428                limit
429            }
430        };
431
432        while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
433            (!item.emitted
434                && matches!(item.change.kind(), ChangeKind::Addition)
435                && filter.map_or_else(
436                    || {
437                        self.rewrites.track_empty
438                            // We always want to keep track of entries that are involved of a directory rename.
439                            // Note that this may still match them up arbitrarily if empty, but empty is empty.
440                            || matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
441                            || {
442                                let id = item.change.id();
443                                id != gix_hash::ObjectId::empty_blob(id.kind())
444                            }
445                    },
446                    |f| f(&item.change),
447                ))
448            .then_some((idx, item))
449        }) {
450            dest_idx += dest_ofs;
451            dest_ofs = dest_idx + 1;
452            self.items[dest_idx].location(&self.path_backing);
453            let src = find_match(
454                &self.items,
455                dest,
456                dest_idx,
457                percentage,
458                kind,
459                stats,
460                objects,
461                diff_cache,
462                &self.path_backing,
463                &mut num_checks,
464            )?
465            .map(|(src_idx, src, diff)| {
466                let (id, entry_mode) = src.change.id_and_entry_mode();
467                let id = id.to_owned();
468                let location = src.location(&self.path_backing);
469                (
470                    visit::Source {
471                        entry_mode,
472                        id,
473                        kind,
474                        location,
475                        change: &src.change,
476                        diff,
477                    },
478                    src_idx,
479                )
480            });
481            if max_checks != 0 && num_checks > max_checks {
482                gix_trace::warn!(
483                    "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})"
484                );
485                return Ok(std::ops::ControlFlow::Break(()));
486            }
487            let Some((src, src_idx)) = src else {
488                continue;
489            };
490            let location = dest.location(&self.path_backing);
491            let change = dest.change.clone();
492            let dest = visit::Destination { change, location };
493            let relations = if percentage.is_none() {
494                src.change.relation().zip(dest.change.relation())
495            } else {
496                None
497            };
498            let res = cb(dest, Some(src));
499
500            self.items[dest_idx].emitted = true;
501            self.items[src_idx].emitted = true;
502
503            if res.is_break() {
504                return Ok(std::ops::ControlFlow::Break(()));
505            }
506
507            match relations {
508                Some((Relation::Parent(src), Relation::Parent(dst))) => {
509                    let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
510                    if res.is_break() {
511                        return Ok(std::ops::ControlFlow::Break(()));
512                    }
513                }
514                Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
515                    self.child_renames.insert((src, dst));
516                }
517                _ => {}
518            }
519        }
520        Ok(std::ops::ControlFlow::Continue(()))
521    }
522
523    /// Emit the children of `src_parent_id` and `dst_parent_id` as pairs of exact matches, which are assumed
524    /// as `src` and `dst` were an exact match (so all children have to match exactly).
525    /// Note that we intentionally do not record them as their parents will be emitted, too.
526    fn emit_child_renames_matching_identity(
527        &mut self,
528        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
529        kind: visit::SourceKind,
530        src_parent_id: ChangeId,
531        dst_parent_id: ChangeId,
532    ) -> Result<Action, emit::Error> {
533        debug_assert_ne!(
534            src_parent_id, dst_parent_id,
535            "src and destination directories must be distinct"
536        );
537        let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
538        for item in self.items.iter_mut().filter(|item| !item.emitted) {
539            match item.change.relation() {
540                Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
541                    src_items.push((item.change.id().to_owned(), item));
542                }
543                Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
544                    dst_items.push((item.change.id().to_owned(), item));
545                }
546                _ => continue,
547            }
548        }
549
550        for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
551            // Since the parent items are already identical by ID, we know that the children will also match, we just
552            // double-check to still have a chance to be correct in case some of that goes wrong.
553            if src_id == dst_id
554                && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
555            {
556                let entry_mode = src_item.change.entry_mode();
557                let location = src_item.location(&self.path_backing);
558                let src = visit::Source {
559                    entry_mode,
560                    id: src_id,
561                    kind,
562                    location,
563                    change: &src_item.change,
564                    diff: None,
565                };
566                let location = dst_item.location(&self.path_backing);
567                let change = dst_item.change.clone();
568                let dst = visit::Destination { change, location };
569                let res = cb(dst, Some(src));
570
571                src_item.emitted = true;
572                dst_item.emitted = true;
573
574                if res.is_break() {
575                    return Ok(res);
576                }
577            } else {
578                gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be");
579                break;
580            }
581        }
582        Ok(std::ops::ControlFlow::Continue(()))
583    }
584
585    /// Find directories with relation id that haven't been emitted yet and store them for lookup.
586    /// Then use the previously stored emitted renames with relation id to learn which directories they 'link'
587    /// and emit them, too.
588    /// Note that this works whenever top-level directories are renamed because they are always added and deleted,
589    /// and we only match those. Thus, one rewrite inside the directory is enough.
590    fn match_renamed_directories(
591        &mut self,
592        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
593    ) -> Result<(), emit::Error> {
594        fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
595            items.iter().position(|i| {
596                !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
597            })
598        }
599        for (deleted_child_id, added_child_id) in &self.child_renames {
600            let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
601                continue;
602            };
603            let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
604                // This could go wrong in case there are mismatches, so be defensive here.
605                // But generally, we'd expect the destination item to exist.
606                continue;
607            };
608
609            let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
610            let entry_mode = src_item.change.entry_mode();
611            let location = src_item.location(&self.path_backing);
612            let src = visit::Source {
613                entry_mode,
614                id: src_item.change.id().to_owned(),
615                kind: SourceKind::Rename,
616                location,
617                change: &src_item.change,
618                diff: None,
619            };
620            let location = dst_item.location(&self.path_backing);
621            let change = dst_item.change.clone();
622            let dst = visit::Destination { change, location };
623            let res = cb(dst, Some(src));
624
625            self.items[src_idx].emitted = true;
626            self.items[dst_idx].emitted = true;
627
628            if res.is_break() {
629                return Ok(());
630            }
631        }
632        Ok(())
633    }
634}
635
636fn filename(path: &BStr) -> &BStr {
637    path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
638}
639
640/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
641fn estimate_involved_items(
642    items: impl IntoIterator<Item = (bool, ChangeKind)>,
643    kind: visit::SourceKind,
644) -> (usize, usize) {
645    items
646        .into_iter()
647        .filter(|(emitted, _)| match kind {
648            visit::SourceKind::Rename => !*emitted,
649            visit::SourceKind::Copy => true,
650        })
651        .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
652            match change_kind {
653                ChangeKind::Addition => {
654                    if kind == visit::SourceKind::Rename || !emitted {
655                        dest += 1;
656                    }
657                }
658                ChangeKind::Deletion => {
659                    if kind == visit::SourceKind::Rename {
660                        src += 1;
661                    }
662                }
663                ChangeKind::Modification => {
664                    if kind == visit::SourceKind::Copy {
665                        src += 1;
666                    }
667                }
668            }
669            (src, dest)
670        })
671}
672
673fn needs_exact_match(percentage: Option<f32>) -> bool {
674    percentage.is_none_or(|p| p >= 1.0)
675}
676
677/// <`src_idx`, src, possibly diff stat>
678type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
679
680/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
681/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
682/// We also ignore emitted items entirely.
683/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
684/// any non-deletion otherwise.
685/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
686/// of items to be searched.
687#[allow(clippy::too_many_arguments)]
688fn find_match<'a, T: Change>(
689    items: &'a [Item<T>],
690    item: &Item<T>,
691    item_idx: usize,
692    percentage: Option<f32>,
693    kind: visit::SourceKind,
694    stats: &mut Outcome,
695    objects: &impl gix_object::FindObjectOrHeader,
696    diff_cache: &mut crate::blob::Platform,
697    path_backing: &[u8],
698    num_checks: &mut usize,
699) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
700    let (item_id, item_mode) = item.change.id_and_entry_mode();
701    if needs_exact_match(percentage) || item_mode.is_link() {
702        let first_idx = items.partition_point(|a| a.change.id() < item_id);
703        let range = items.get(first_idx..).map(|slice| {
704            let end = slice
705                .iter()
706                .position(|a| a.change.id() != item_id)
707                .map_or(items.len(), |idx| first_idx + idx);
708            first_idx..end
709        });
710        let range = match range {
711            Some(range) => range,
712            None => return Ok(None),
713        };
714        if range.is_empty() {
715            return Ok(None);
716        }
717        let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
718            src_idx += range.start;
719            *num_checks += 1;
720            (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
721        });
722        if let Some(src) = res {
723            return Ok(Some(src));
724        }
725    } else if item_mode.is_blob() {
726        let mut has_new = false;
727        let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
728
729        for (can_idx, src) in items
730            .iter()
731            .enumerate()
732            .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
733        {
734            if !has_new {
735                diff_cache.set_resource(
736                    item_id.to_owned(),
737                    item_mode.kind(),
738                    item.location(path_backing),
739                    ResourceKind::NewOrDestination,
740                    objects,
741                )?;
742                has_new = true;
743            }
744            let (src_id, src_mode) = src.change.id_and_entry_mode();
745            diff_cache.set_resource(
746                src_id.to_owned(),
747                src_mode.kind(),
748                src.location(path_backing),
749                ResourceKind::OldOrSource,
750                objects,
751            )?;
752            let prep = diff_cache.prepare_diff()?;
753            stats.num_similarity_checks += 1;
754            *num_checks += 1;
755            match prep.operation {
756                Operation::InternalDiff { algorithm } => {
757                    let tokens =
758                        crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
759                    let counts = crate::blob::diff(
760                        algorithm,
761                        &tokens,
762                        crate::blob::sink::Counter::new(diff::Statistics {
763                            removed_bytes: 0,
764                            input: &tokens,
765                        }),
766                    );
767                    let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
768                    let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
769                    let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
770                    if similarity >= percentage {
771                        return Ok(Some((
772                            can_idx,
773                            src,
774                            DiffLineStats {
775                                removals: counts.removals,
776                                insertions: counts.insertions,
777                                before: tokens.before.len().try_into().expect("interner handles only u32"),
778                                after: tokens.after.len().try_into().expect("interner handles only u32"),
779                                similarity,
780                            }
781                            .into(),
782                        )));
783                    }
784                }
785                Operation::ExternalCommand { .. } => {
786                    unreachable!("we have disabled this possibility with an option")
787                }
788                Operation::SourceOrDestinationIsBinary => {
789                    // TODO: figure out if git does more here
790                }
791            }
792        }
793    }
794    Ok(None)
795}
796
797mod diff {
798    use std::ops::Range;
799
800    pub struct Statistics<'a, 'data> {
801        pub removed_bytes: usize,
802        pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
803    }
804
805    impl crate::blob::Sink for Statistics<'_, '_> {
806        type Out = usize;
807
808        fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
809            self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
810                .iter()
811                .map(|token| self.input.interner[*token].len())
812                .sum::<usize>();
813        }
814
815        fn finish(self) -> Self::Out {
816            self.removed_bytes
817        }
818    }
819}
820
821#[cfg(test)]
822mod estimate_involved_items {
823    use super::estimate_involved_items;
824    use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
825
826    #[test]
827    fn renames_count_unemitted_as_sources_and_destinations() {
828        let items = [
829            (false, ChangeKind::Addition),
830            (true, ChangeKind::Deletion),
831            (true, ChangeKind::Deletion),
832        ];
833        assert_eq!(
834            estimate_involved_items(items, SourceKind::Rename),
835            (0, 1),
836            "here we only have one eligible source, hence nothing to do"
837        );
838        assert_eq!(
839            estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
840            (2, 1),
841            "now we have more possibilities as renames count un-emitted deletions as source"
842        );
843    }
844
845    #[test]
846    fn copies_do_not_count_additions_as_sources() {
847        let items = [
848            (false, ChangeKind::Addition),
849            (true, ChangeKind::Addition),
850            (true, ChangeKind::Deletion),
851        ];
852        assert_eq!(
853            estimate_involved_items(items, SourceKind::Copy),
854            (0, 1),
855            "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
856            deletions don't count"
857        );
858    }
859
860    #[test]
861    fn copies_count_modifications_as_sources() {
862        let items = [
863            (false, ChangeKind::Addition),
864            (true, ChangeKind::Modification),
865            (false, ChangeKind::Modification),
866        ];
867        assert_eq!(
868            estimate_involved_items(items, SourceKind::Copy),
869            (2, 1),
870            "any modifications is a valid source, emitted or not"
871        );
872    }
873}