gix_blame/file/
function.rs

1use super::{process_changes, Change, UnblamedHunk};
2use crate::{BlameEntry, Error, Outcome, Statistics};
3use gix_diff::blob::intern::TokenSource;
4use gix_diff::tree::Visit;
5use gix_hash::ObjectId;
6use gix_object::{
7    bstr::{BStr, BString},
8    FindExt,
9};
10use std::num::NonZeroU32;
11use std::ops::Range;
12
13/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
14/// at `traverse[0]:<file_path>` originated in.
15///
16/// ## Paramters
17///
18/// * `odb`
19///    - Access to database objects, also for used for diffing.
20///    - Should have an object cache for good diff performance.
21/// * `traverse`
22///    - The list of commits from the most recent to prior ones, following all parents sorted
23///      by time.
24///    - It's paramount that older commits are returned after newer ones.
25///    - The first commit returned here is the first eligible commit to be responsible for parts of `file_path`.
26/// * `file_path`
27///    - A *slash-separated* worktree-relative path to the file to blame.
28/// * `range`
29///    - A 1-based inclusive range, in order to mirror `git`’s behaviour. `Some(20..40)` represents
30///      21 lines, spanning from line 20 up to and including line 40. This will be converted to
31///      `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end.
32/// * `resource_cache`
33///    - Used for diffing trees.
34///
35/// ## The algorithm
36///
37/// *For brevity, `HEAD` denotes the starting point of the blame operation. It could be any commit, or even commits that
38/// represent the worktree state.
39/// We begin with a single *Unblamed Hunk* and a single suspect, usually the `HEAD` commit as the commit containing the
40/// *Blamed File*, so that it contains the entire file, with the first commit being a candidate for the entire *Blamed File*.
41/// We traverse the commit graph starting at the first suspect, and see if there have been changes to `file_path`.
42/// If so, we have found a *Source File* and a *Suspect* commit, and have hunks that represent these changes.
43/// Now the *Unblamed Hunk* is split at the boundaries of each matching change, creating a new *Unblamed Hunk* on each side,
44/// along with a [`BlameEntry`] to represent the match.
45/// This is repeated until there are no non-empty *Unblamed Hunk*s left.
46///
47/// At a high level, what we want to do is the following:
48///
49/// - get the commit
50/// - walk through its parents
51///   - for each parent, do a diff and mark lines that don’t have a suspect yet (this is the term
52///     used in `libgit2`), but that have been changed in this commit
53///
54/// The algorithm in `libgit2` works by going through parents and keeping a linked list of blame
55/// suspects. It can be visualized as follows:
56//
57// <---------------------------------------->
58// <---------------><----------------------->
59// <---><----------><----------------------->
60// <---><----------><-------><-----><------->
61// <---><---><-----><-------><-----><------->
62// <---><---><-----><-------><-----><-><-><->
63pub fn file<E>(
64    odb: impl gix_object::Find + gix_object::FindHeader,
65    traverse: impl IntoIterator<Item = Result<gix_traverse::commit::Info, E>>,
66    resource_cache: &mut gix_diff::blob::Platform,
67    file_path: &BStr,
68    range: Option<Range<u32>>,
69) -> Result<Outcome, Error>
70where
71    E: Into<Box<dyn std::error::Error + Send + Sync + 'static>>,
72{
73    let mut traverse = traverse.into_iter().peekable();
74    let Some(Ok(suspect)) = traverse.peek().map(|res| res.as_ref().map(|item| item.id)) else {
75        return Err(Error::EmptyTraversal);
76    };
77    let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect);
78
79    let mut stats = Statistics::default();
80    let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new());
81    let blamed_file_entry_id = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)?
82        .ok_or_else(|| Error::FileMissing {
83        file_path: file_path.to_owned(),
84        commit_id: suspect,
85    })?;
86    let blamed_file_blob = odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec();
87    let num_lines_in_blamed = {
88        let mut interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100);
89        tokens_for_diffing(&blamed_file_blob)
90            .tokenize()
91            .map(|token| interner.intern(token))
92            .count()
93    } as u32;
94
95    // Binary or otherwise empty?
96    if num_lines_in_blamed == 0 {
97        return Ok(Outcome::default());
98    }
99
100    let range_in_blamed_file = one_based_inclusive_to_zero_based_exclusive_range(range, num_lines_in_blamed)?;
101    let mut hunks_to_blame = vec![UnblamedHunk {
102        range_in_blamed_file: range_in_blamed_file.clone(),
103        suspects: [(suspect, range_in_blamed_file)].into(),
104    }];
105
106    let mut out = Vec::new();
107    let mut diff_state = gix_diff::tree::State::default();
108    let mut previous_entry: Option<(ObjectId, ObjectId)> = None;
109    'outer: while let Some(item) = traverse.next() {
110        if hunks_to_blame.is_empty() {
111            break;
112        }
113        let commit = item.map_err(|err| Error::Traverse(err.into()))?;
114        let suspect = commit.id;
115        stats.commits_traversed += 1;
116
117        let parent_ids = commit.parent_ids;
118        if parent_ids.is_empty() {
119            if traverse.peek().is_none() {
120                // I’m not entirely sure if this is correct yet. `suspect`, at this point, is the `id` of
121                // the last `item` that was yielded by `traverse`, so it makes sense to assign the
122                // remaining lines to it, even though we don’t explicitly check whether that is true
123                // here. We could perhaps use diff-tree-to-tree to compare `suspect`
124                // against an empty tree to validate this assumption.
125                if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
126                    break 'outer;
127                }
128            }
129
130            // There is more, keep looking.
131            continue;
132        }
133
134        let mut entry = previous_entry
135            .take()
136            .filter(|(id, _)| *id == suspect)
137            .map(|(_, entry)| entry);
138        if entry.is_none() {
139            entry = find_path_entry_in_commit(&odb, &suspect, file_path, &mut buf, &mut buf2, &mut stats)?;
140        }
141
142        let Some(entry_id) = entry else {
143            continue;
144        };
145
146        for (pid, parent_id) in parent_ids.iter().enumerate() {
147            if let Some(parent_entry_id) =
148                find_path_entry_in_commit(&odb, parent_id, file_path, &mut buf, &mut buf2, &mut stats)?
149            {
150                let no_change_in_entry = entry_id == parent_entry_id;
151                if pid == 0 {
152                    previous_entry = Some((*parent_id, parent_entry_id));
153                }
154                if no_change_in_entry {
155                    pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame);
156                    continue 'outer;
157                }
158            }
159        }
160
161        let more_than_one_parent = parent_ids.len() > 1;
162        for parent_id in parent_ids {
163            let changes_for_file_path = tree_diff_at_file_path(
164                &odb,
165                file_path,
166                commit.id,
167                parent_id,
168                &mut stats,
169                &mut diff_state,
170                &mut buf,
171                &mut buf2,
172                &mut buf3,
173            )?;
174            let Some(modification) = changes_for_file_path else {
175                if more_than_one_parent {
176                    // None of the changes affected the file we’re currently blaming.
177                    // Copy blame to parent.
178                    for unblamed_hunk in &mut hunks_to_blame {
179                        unblamed_hunk.clone_blame(suspect, parent_id);
180                    }
181                } else {
182                    pass_blame_from_to(suspect, parent_id, &mut hunks_to_blame);
183                }
184                continue;
185            };
186
187            match modification {
188                gix_diff::tree::recorder::Change::Addition { .. } => {
189                    if more_than_one_parent {
190                        // Do nothing under the assumption that this always (or almost always)
191                        // implies that the file comes from a different parent, compared to which
192                        // it was modified, not added.
193                    } else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
194                        break 'outer;
195                    }
196                }
197                gix_diff::tree::recorder::Change::Deletion { .. } => {
198                    unreachable!("We already found file_path in suspect^{{tree}}, so it can't be deleted")
199                }
200                gix_diff::tree::recorder::Change::Modification { previous_oid, oid, .. } => {
201                    let changes = blob_changes(&odb, resource_cache, oid, previous_oid, file_path, &mut stats)?;
202                    hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id);
203                }
204            }
205        }
206
207        hunks_to_blame.retain_mut(|unblamed_hunk| {
208            if unblamed_hunk.suspects.len() == 1 {
209                if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) {
210                    // At this point, we have copied blame for every hunk to a parent. Hunks
211                    // that have only `suspect` left in `suspects` have not passed blame to any
212                    // parent, and so they can be converted to a `BlameEntry` and moved to
213                    // `out`.
214                    out.push(entry);
215                    return false;
216                }
217            }
218            unblamed_hunk.remove_blame(suspect);
219            true
220        });
221
222        // This block asserts that line ranges for each suspect never overlap. If they did overlap
223        // this would mean that the same line in a *Source File* would map to more than one line in
224        // the *Blamed File* and this is not possible.
225        #[cfg(debug_assertions)]
226        {
227            let ranges = hunks_to_blame.iter().fold(
228                std::collections::BTreeMap::<ObjectId, Vec<Range<u32>>>::new(),
229                |mut acc, hunk| {
230                    for (suspect, range) in hunk.suspects.clone() {
231                        acc.entry(suspect).or_default().push(range);
232                    }
233
234                    acc
235                },
236            );
237
238            for (_, mut ranges) in ranges {
239                ranges.sort_by(|a, b| a.start.cmp(&b.start));
240
241                for window in ranges.windows(2) {
242                    if let [a, b] = window {
243                        assert!(a.end <= b.start, "#{hunks_to_blame:#?}");
244                    }
245                }
246            }
247        }
248    }
249
250    debug_assert_eq!(
251        hunks_to_blame,
252        vec![],
253        "only if there is no portion of the file left we have completed the blame"
254    );
255
256    // I don’t know yet whether it would make sense to use a data structure instead that preserves
257    // order on insertion.
258    out.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file));
259    Ok(Outcome {
260        entries: coalesce_blame_entries(out),
261        blob: blamed_file_blob,
262        statistics: stats,
263    })
264}
265
266/// This function assumes that `range` has 1-based inclusive line numbers and converts it to the
267/// format internally used: 0-based line numbers stored in ranges that are exclusive at the
268/// end.
269fn one_based_inclusive_to_zero_based_exclusive_range(
270    range: Option<Range<u32>>,
271    max_lines: u32,
272) -> Result<Range<u32>, Error> {
273    let Some(range) = range else { return Ok(0..max_lines) };
274    if range.start == 0 {
275        return Err(Error::InvalidLineRange);
276    }
277    let start = range.start - 1;
278    let end = range.end;
279    if start >= max_lines || end > max_lines || start == end {
280        return Err(Error::InvalidLineRange);
281    }
282    Ok(start..end)
283}
284
285/// Pass ownership of each unblamed hunk of `from` to `to`.
286///
287/// This happens when `from` didn't actually change anything in the blamed file.
288fn pass_blame_from_to(from: ObjectId, to: ObjectId, hunks_to_blame: &mut Vec<UnblamedHunk>) {
289    for unblamed_hunk in hunks_to_blame {
290        unblamed_hunk.pass_blame(from, to);
291    }
292}
293
294/// Convert each of the unblamed hunk in `hunks_to_blame` into a [`BlameEntry`], consuming them in the process.
295///
296/// Return `true` if we are done because `hunks_to_blame` is empty.
297fn unblamed_to_out_is_done(
298    hunks_to_blame: &mut Vec<UnblamedHunk>,
299    out: &mut Vec<BlameEntry>,
300    suspect: ObjectId,
301) -> bool {
302    let mut without_suspect = Vec::new();
303    out.extend(hunks_to_blame.drain(..).filter_map(|hunk| {
304        BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| {
305            without_suspect.push(hunk);
306            None
307        })
308    }));
309    *hunks_to_blame = without_suspect;
310    hunks_to_blame.is_empty()
311}
312
313/// This function merges adjacent blame entries. It merges entries that are adjacent both in the
314/// blamed file and in the source file that introduced them. This follows `git`’s
315/// behaviour. `libgit2`, as of 2024-09-19, only checks whether two entries are adjacent in the
316/// blamed file which can result in different blames in certain edge cases. See [the commit][1]
317/// that introduced the extra check into `git` for context. See [this commit][2] for a way to test
318/// for this behaviour in `git`.
319///
320/// [1]: https://github.com/git/git/commit/c2ebaa27d63bfb7c50cbbdaba90aee4efdd45d0a
321/// [2]: https://github.com/git/git/commit/6dbf0c7bebd1c71c44d786ebac0f2b3f226a0131
322fn coalesce_blame_entries(lines_blamed: Vec<BlameEntry>) -> Vec<BlameEntry> {
323    let len = lines_blamed.len();
324    lines_blamed
325        .into_iter()
326        .fold(Vec::with_capacity(len), |mut acc, entry| {
327            let previous_entry = acc.last();
328
329            if let Some(previous_entry) = previous_entry {
330                let previous_blamed_range = previous_entry.range_in_blamed_file();
331                let current_blamed_range = entry.range_in_blamed_file();
332                let previous_source_range = previous_entry.range_in_source_file();
333                let current_source_range = entry.range_in_source_file();
334                if previous_entry.commit_id == entry.commit_id
335                    && previous_blamed_range.end == current_blamed_range.start
336                    // As of 2024-09-19, the check below only is in `git`, but not in `libgit2`.
337                    && previous_source_range.end == current_source_range.start
338                {
339                    // let combined_range =
340                    let coalesced_entry = BlameEntry {
341                        start_in_blamed_file: previous_blamed_range.start as u32,
342                        start_in_source_file: previous_source_range.start as u32,
343                        len: NonZeroU32::new((current_source_range.end - previous_source_range.start) as u32)
344                            .expect("BUG: hunks are never zero-sized"),
345                        commit_id: previous_entry.commit_id,
346                    };
347
348                    acc.pop();
349                    acc.push(coalesced_entry);
350                } else {
351                    acc.push(entry);
352                }
353
354                acc
355            } else {
356                acc.push(entry);
357
358                acc
359            }
360        })
361}
362
363#[allow(clippy::too_many_arguments)]
364fn tree_diff_at_file_path(
365    odb: impl gix_object::Find + gix_object::FindHeader,
366    file_path: &BStr,
367    id: ObjectId,
368    parent_id: ObjectId,
369    stats: &mut Statistics,
370    state: &mut gix_diff::tree::State,
371    commit_buf: &mut Vec<u8>,
372    lhs_tree_buf: &mut Vec<u8>,
373    rhs_tree_buf: &mut Vec<u8>,
374) -> Result<Option<gix_diff::tree::recorder::Change>, Error> {
375    let parent_tree = odb.find_commit(&parent_id, commit_buf)?.tree();
376    stats.commits_to_tree += 1;
377
378    let parent_tree_iter = odb.find_tree_iter(&parent_tree, lhs_tree_buf)?;
379    stats.trees_decoded += 1;
380
381    let tree_id = odb.find_commit(&id, commit_buf)?.tree();
382    stats.commits_to_tree += 1;
383
384    let tree_iter = odb.find_tree_iter(&tree_id, rhs_tree_buf)?;
385    stats.trees_decoded += 1;
386
387    struct FindChangeToPath {
388        inner: gix_diff::tree::Recorder,
389        interesting_path: BString,
390        change: Option<gix_diff::tree::recorder::Change>,
391    }
392
393    impl FindChangeToPath {
394        fn new(interesting_path: BString) -> Self {
395            let inner =
396                gix_diff::tree::Recorder::default().track_location(Some(gix_diff::tree::recorder::Location::Path));
397
398            FindChangeToPath {
399                inner,
400                interesting_path,
401                change: None,
402            }
403        }
404    }
405
406    impl Visit for FindChangeToPath {
407        fn pop_front_tracked_path_and_set_current(&mut self) {
408            self.inner.pop_front_tracked_path_and_set_current();
409        }
410
411        fn push_back_tracked_path_component(&mut self, component: &BStr) {
412            self.inner.push_back_tracked_path_component(component);
413        }
414
415        fn push_path_component(&mut self, component: &BStr) {
416            self.inner.push_path_component(component);
417        }
418
419        fn pop_path_component(&mut self) {
420            self.inner.pop_path_component();
421        }
422
423        fn visit(&mut self, change: gix_diff::tree::visit::Change) -> gix_diff::tree::visit::Action {
424            use gix_diff::tree::visit;
425            use gix_diff::tree::visit::Change::*;
426
427            if self.inner.path() == self.interesting_path {
428                self.change = Some(match change {
429                    Deletion {
430                        entry_mode,
431                        oid,
432                        relation,
433                    } => gix_diff::tree::recorder::Change::Deletion {
434                        entry_mode,
435                        oid,
436                        path: self.inner.path_clone(),
437                        relation,
438                    },
439                    Addition {
440                        entry_mode,
441                        oid,
442                        relation,
443                    } => gix_diff::tree::recorder::Change::Addition {
444                        entry_mode,
445                        oid,
446                        path: self.inner.path_clone(),
447                        relation,
448                    },
449                    Modification {
450                        previous_entry_mode,
451                        previous_oid,
452                        entry_mode,
453                        oid,
454                    } => gix_diff::tree::recorder::Change::Modification {
455                        previous_entry_mode,
456                        previous_oid,
457                        entry_mode,
458                        oid,
459                        path: self.inner.path_clone(),
460                    },
461                });
462
463                visit::Action::Cancel
464            } else {
465                visit::Action::Continue
466            }
467        }
468    }
469
470    let mut recorder = FindChangeToPath::new(file_path.into());
471    let result = gix_diff::tree(parent_tree_iter, tree_iter, state, &odb, &mut recorder);
472    stats.trees_diffed += 1;
473
474    match result {
475        Ok(_) | Err(gix_diff::tree::Error::Cancelled) => Ok(recorder.change),
476        Err(error) => Err(Error::DiffTree(error)),
477    }
478}
479
480fn blob_changes(
481    odb: impl gix_object::Find + gix_object::FindHeader,
482    resource_cache: &mut gix_diff::blob::Platform,
483    oid: ObjectId,
484    previous_oid: ObjectId,
485    file_path: &BStr,
486    stats: &mut Statistics,
487) -> Result<Vec<Change>, Error> {
488    /// Record all [`Change`]s to learn about additions, deletions and unchanged portions of a *Source File*.
489    struct ChangeRecorder {
490        last_seen_after_end: u32,
491        hunks: Vec<Change>,
492        total_number_of_lines: u32,
493    }
494
495    impl ChangeRecorder {
496        /// `total_number_of_lines` is used to fill in the last unchanged hunk if needed
497        /// so that the entire file is represented by [`Change`].
498        fn new(total_number_of_lines: u32) -> Self {
499            ChangeRecorder {
500                last_seen_after_end: 0,
501                hunks: Vec::new(),
502                total_number_of_lines,
503            }
504        }
505    }
506
507    impl gix_diff::blob::Sink for ChangeRecorder {
508        type Out = Vec<Change>;
509
510        fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
511            // This checks for unchanged hunks.
512            if after.start > self.last_seen_after_end {
513                self.hunks
514                    .push(Change::Unchanged(self.last_seen_after_end..after.start));
515            }
516
517            match (!before.is_empty(), !after.is_empty()) {
518                (_, true) => {
519                    self.hunks.push(Change::AddedOrReplaced(
520                        after.start..after.end,
521                        before.end - before.start,
522                    ));
523                }
524                (true, false) => {
525                    self.hunks.push(Change::Deleted(after.start, before.end - before.start));
526                }
527                (false, false) => unreachable!("BUG: imara-diff provided a non-change"),
528            }
529            self.last_seen_after_end = after.end;
530        }
531
532        fn finish(mut self) -> Self::Out {
533            if self.total_number_of_lines > self.last_seen_after_end {
534                self.hunks
535                    .push(Change::Unchanged(self.last_seen_after_end..self.total_number_of_lines));
536            }
537            self.hunks
538        }
539    }
540
541    resource_cache.set_resource(
542        previous_oid,
543        gix_object::tree::EntryKind::Blob,
544        file_path,
545        gix_diff::blob::ResourceKind::OldOrSource,
546        &odb,
547    )?;
548    resource_cache.set_resource(
549        oid,
550        gix_object::tree::EntryKind::Blob,
551        file_path,
552        gix_diff::blob::ResourceKind::NewOrDestination,
553        &odb,
554    )?;
555
556    let outcome = resource_cache.prepare_diff()?;
557    let input = gix_diff::blob::intern::InternedInput::new(
558        tokens_for_diffing(outcome.old.data.as_slice().unwrap_or_default()),
559        tokens_for_diffing(outcome.new.data.as_slice().unwrap_or_default()),
560    );
561    let number_of_lines_in_destination = input.after.len();
562    let change_recorder = ChangeRecorder::new(number_of_lines_in_destination as u32);
563
564    let res = gix_diff::blob::diff(gix_diff::blob::Algorithm::Histogram, &input, change_recorder);
565    stats.blobs_diffed += 1;
566    Ok(res)
567}
568
569fn find_path_entry_in_commit(
570    odb: &impl gix_object::Find,
571    commit: &gix_hash::oid,
572    file_path: &BStr,
573    buf: &mut Vec<u8>,
574    buf2: &mut Vec<u8>,
575    stats: &mut Statistics,
576) -> Result<Option<ObjectId>, Error> {
577    let commit_id = odb.find_commit(commit, buf)?.tree();
578    stats.commits_to_tree += 1;
579    let tree_iter = odb.find_tree_iter(&commit_id, buf)?;
580    stats.trees_decoded += 1;
581
582    let res = tree_iter.lookup_entry(
583        odb,
584        buf2,
585        file_path.split(|b| *b == b'/').inspect(|_| stats.trees_decoded += 1),
586    )?;
587    stats.trees_decoded -= 1;
588    Ok(res.map(|e| e.oid))
589}
590
591/// Return an iterator over tokens for use in diffing. These usually lines, but iit's important to unify them
592/// so the later access shows the right thing.
593pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
594    gix_diff::blob::sources::byte_lines_with_terminator(data)
595}