gix_blame/
types.rs

1use gix_hash::ObjectId;
2use gix_object::bstr::BString;
3use smallvec::SmallVec;
4use std::ops::RangeInclusive;
5use std::{
6    num::NonZeroU32,
7    ops::{AddAssign, Range, SubAssign},
8};
9
10use crate::file::function::tokens_for_diffing;
11use crate::Error;
12
13/// A type to represent one or more line ranges to blame in a file.
14///
15/// It handles the conversion between git's 1-based inclusive ranges and the internal
16/// 0-based exclusive ranges used by the blame algorithm.
17///
18/// # Examples
19///
20/// ```rust
21/// use gix_blame::BlameRanges;
22///
23/// // Blame lines 20 through 40 (inclusive)
24/// let range = BlameRanges::from_one_based_inclusive_range(20..=40);
25///
26/// // Blame multiple ranges
27/// let mut ranges = BlameRanges::from_one_based_inclusive_ranges(vec![
28///     1..=4, // Lines 1-4
29///    10..=14, // Lines 10-14
30/// ]
31/// );
32/// ```
33///
34/// # Line Number Representation
35///
36/// This type uses 1-based inclusive ranges to mirror `git`'s behaviour:
37/// - A range of `20..=40` represents 21 lines, spanning from line 20 up to and including line 40
38/// - This will be converted to `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end
39///
40/// # Empty Ranges
41/// You can blame the entire file by calling `BlameRanges::default()`, or by passing an empty vector to `from_one_based_inclusive_ranges`.
42#[derive(Debug, Clone, Default)]
43pub enum BlameRanges {
44    /// Blame the entire file.
45    #[default]
46    WholeFile,
47    /// Blame ranges in 0-based exclusive format.
48    PartialFile(Vec<Range<u32>>),
49}
50
51/// Lifecycle
52impl BlameRanges {
53    /// Create from a single 0-based range.
54    ///
55    /// Note that the input range is 1-based inclusive, as used by git, and
56    /// the output is a zero-based `BlameRanges` instance.
57    pub fn from_one_based_inclusive_range(range: RangeInclusive<u32>) -> Result<Self, Error> {
58        let zero_based_range = Self::inclusive_to_zero_based_exclusive(range)?;
59        Ok(Self::PartialFile(vec![zero_based_range]))
60    }
61
62    /// Create from multiple 0-based ranges.
63    ///
64    /// Note that the input ranges are 1-based inclusive, as used by git, and
65    /// the output is a zero-based `BlameRanges` instance.
66    ///
67    /// If the input vector is empty, the result will be `WholeFile`.
68    pub fn from_one_based_inclusive_ranges(ranges: Vec<RangeInclusive<u32>>) -> Result<Self, Error> {
69        if ranges.is_empty() {
70            return Ok(Self::WholeFile);
71        }
72
73        let zero_based_ranges = ranges
74            .into_iter()
75            .map(Self::inclusive_to_zero_based_exclusive)
76            .collect::<Vec<_>>();
77        let mut result = Self::PartialFile(vec![]);
78        for range in zero_based_ranges {
79            result.merge_zero_based_exclusive_range(range?);
80        }
81        Ok(result)
82    }
83
84    /// Convert a 1-based inclusive range to a 0-based exclusive range.
85    fn inclusive_to_zero_based_exclusive(range: RangeInclusive<u32>) -> Result<Range<u32>, Error> {
86        if range.start() == &0 {
87            return Err(Error::InvalidOneBasedLineRange);
88        }
89        let start = range.start() - 1;
90        let end = *range.end();
91        Ok(start..end)
92    }
93}
94
95impl BlameRanges {
96    /// Add a single range to blame.
97    ///
98    /// The new range will be merged with any overlapping existing ranges.
99    pub fn add_one_based_inclusive_range(&mut self, new_range: RangeInclusive<u32>) -> Result<(), Error> {
100        let zero_based_range = Self::inclusive_to_zero_based_exclusive(new_range)?;
101        self.merge_zero_based_exclusive_range(zero_based_range);
102
103        Ok(())
104    }
105
106    /// Adds a new ranges, merging it with any existing overlapping ranges.
107    fn merge_zero_based_exclusive_range(&mut self, new_range: Range<u32>) {
108        match self {
109            Self::PartialFile(ref mut ranges) => {
110                // Partition ranges into those that don't overlap and those that do.
111                let (mut non_overlapping, overlapping): (Vec<_>, Vec<_>) = ranges
112                    .drain(..)
113                    .partition(|range| new_range.end < range.start || range.end < new_range.start);
114
115                let merged_range = overlapping.into_iter().fold(new_range, |acc, range| {
116                    acc.start.min(range.start)..acc.end.max(range.end)
117                });
118
119                non_overlapping.push(merged_range);
120
121                *ranges = non_overlapping;
122                ranges.sort_by(|a, b| a.start.cmp(&b.start));
123            }
124            Self::WholeFile => *self = Self::PartialFile(vec![new_range]),
125        }
126    }
127
128    /// Gets zero-based exclusive ranges.
129    pub fn to_zero_based_exclusive_ranges(&self, max_lines: u32) -> Vec<Range<u32>> {
130        match self {
131            Self::WholeFile => {
132                let full_range = 0..max_lines;
133                vec![full_range]
134            }
135            Self::PartialFile(ranges) => ranges
136                .iter()
137                .filter_map(|range| {
138                    if range.end < max_lines {
139                        return Some(range.clone());
140                    }
141
142                    if range.start < max_lines {
143                        Some(range.start..max_lines)
144                    } else {
145                        None
146                    }
147                })
148                .collect(),
149        }
150    }
151}
152
153/// Options to be passed to [`file()`](crate::file()).
154#[derive(Default, Debug, Clone)]
155pub struct Options {
156    /// The algorithm to use for diffing.
157    pub diff_algorithm: gix_diff::blob::Algorithm,
158    /// The ranges to blame in the file.
159    pub ranges: BlameRanges,
160    /// Don't consider commits before the given date.
161    pub since: Option<gix_date::Time>,
162    /// Determine if rename tracking should be performed, and how.
163    pub rewrites: Option<gix_diff::Rewrites>,
164    /// Collect debug information whenever there's a diff or rename that affects the outcome of a
165    /// blame.
166    pub debug_track_path: bool,
167}
168
169/// Represents a change during history traversal for blame. It is supposed to capture enough
170/// information to allow reconstruction of the way a blame was performed, i. e. the path the
171/// history traversal, combined with repeated diffing of two subsequent states in this history, has
172/// taken.
173///
174/// This is intended for debugging purposes.
175#[derive(Clone, Debug)]
176pub struct BlamePathEntry {
177    /// The path to the *Source File* in the blob after the change.
178    pub source_file_path: BString,
179    /// The path to the *Source File* in the blob before the change. Allows
180    /// detection of renames. `None` for root commits.
181    pub previous_source_file_path: Option<BString>,
182    /// The commit id associated with the state after the change.
183    pub commit_id: ObjectId,
184    /// The blob id associated with the state after the change.
185    pub blob_id: ObjectId,
186    /// The blob id associated with the state before the change.
187    pub previous_blob_id: ObjectId,
188    /// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent
189    /// commit the change is related.
190    pub parent_index: usize,
191}
192
193/// The outcome of [`file()`](crate::file()).
194#[derive(Debug, Default, Clone)]
195pub struct Outcome {
196    /// One entry in sequential order, to associate a hunk in the blamed file with the source commit (and its lines)
197    /// that introduced it.
198    pub entries: Vec<BlameEntry>,
199    /// A buffer with the file content of the *Blamed File*, ready for tokenization.
200    pub blob: Vec<u8>,
201    /// Additional information about the amount of work performed to produce the blame.
202    pub statistics: Statistics,
203    /// Contains a log of all changes that affected the outcome of this blame.
204    pub blame_path: Option<Vec<BlamePathEntry>>,
205}
206
207/// Additional information about the performed operations.
208#[derive(Debug, Default, Copy, Clone)]
209pub struct Statistics {
210    /// The amount of commits it traversed until the blame was complete.
211    pub commits_traversed: usize,
212    /// The amount of trees that were decoded to find the entry of the file to blame.
213    pub trees_decoded: usize,
214    /// The amount of tree-diffs to see if the filepath was added, deleted or modified. These diffs
215    /// are likely partial as they are cancelled as soon as a change to the blamed file is
216    /// detected.
217    pub trees_diffed: usize,
218    /// The amount of tree-diffs to see if the file was moved (or rewritten, in git terminology).
219    /// These diffs are likely partial as they are cancelled as soon as a change to the blamed file
220    /// is detected.
221    pub trees_diffed_with_rewrites: usize,
222    /// The amount of blobs there were compared to each other to learn what changed between commits.
223    /// Note that in order to diff a blob, one needs to load both versions from the database.
224    pub blobs_diffed: usize,
225}
226
227impl Outcome {
228    /// Return an iterator over each entry in [`Self::entries`], along with its lines, line by line.
229    ///
230    /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used
231    /// to perform the diffs, which is what this method assures.
232    pub fn entries_with_lines(&self) -> impl Iterator<Item = (BlameEntry, Vec<BString>)> + '_ {
233        use gix_diff::blob::intern::TokenSource;
234        let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100);
235        let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob)
236            .tokenize()
237            .map(|token| interner.intern(token))
238            .collect();
239        self.entries.iter().map(move |e| {
240            (
241                e.clone(),
242                lines_as_tokens[e.range_in_blamed_file()]
243                    .iter()
244                    .map(|token| BString::new(interner[*token].into()))
245                    .collect(),
246            )
247        })
248    }
249}
250
251/// Describes the offset of a particular hunk relative to the *Blamed File*.
252#[derive(Clone, Copy, Debug, PartialEq)]
253pub enum Offset {
254    /// The amount of lines to add.
255    Added(u32),
256    /// The amount of lines to remove.
257    Deleted(u32),
258}
259
260impl Offset {
261    /// Shift the given `range` according to our offset.
262    pub fn shifted_range(&self, range: &Range<u32>) -> Range<u32> {
263        match self {
264            Offset::Added(added) => {
265                debug_assert!(range.start >= *added, "{self:?} {range:?}");
266                Range {
267                    start: range.start - added,
268                    end: range.end - added,
269                }
270            }
271            Offset::Deleted(deleted) => Range {
272                start: range.start + deleted,
273                end: range.end + deleted,
274            },
275        }
276    }
277}
278
279impl AddAssign<u32> for Offset {
280    fn add_assign(&mut self, rhs: u32) {
281        match self {
282            Self::Added(added) => *self = Self::Added(*added + rhs),
283            Self::Deleted(deleted) => {
284                if rhs > *deleted {
285                    *self = Self::Added(rhs - *deleted);
286                } else {
287                    *self = Self::Deleted(*deleted - rhs);
288                }
289            }
290        }
291    }
292}
293
294impl SubAssign<u32> for Offset {
295    fn sub_assign(&mut self, rhs: u32) {
296        match self {
297            Self::Added(added) => {
298                if rhs > *added {
299                    *self = Self::Deleted(rhs - *added);
300                } else {
301                    *self = Self::Added(*added - rhs);
302                }
303            }
304            Self::Deleted(deleted) => *self = Self::Deleted(*deleted + rhs),
305        }
306    }
307}
308
309/// A mapping of a section of the *Blamed File* to the section in a *Source File* that introduced it.
310///
311/// Both ranges are of the same size, but may use different [starting points](Range::start). Naturally,
312/// they have the same content, which is the reason they are in what is returned by [`file()`](crate::file()).
313#[derive(Clone, Debug, PartialEq)]
314pub struct BlameEntry {
315    /// The index of the token in the *Blamed File* (typically lines) where this entry begins.
316    pub start_in_blamed_file: u32,
317    /// The index of the token in the *Source File* (typically lines) where this entry begins.
318    ///
319    /// This is possibly offset compared to `start_in_blamed_file`.
320    pub start_in_source_file: u32,
321    /// The amount of lines the hunk is spanning.
322    pub len: NonZeroU32,
323    /// The commit that introduced the section into the *Source File*.
324    pub commit_id: ObjectId,
325    /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
326    /// This happens when the file was renamed.
327    pub source_file_name: Option<BString>,
328}
329
330impl BlameEntry {
331    /// Create a new instance.
332    pub fn new(
333        range_in_blamed_file: Range<u32>,
334        range_in_source_file: Range<u32>,
335        commit_id: ObjectId,
336        source_file_name: Option<BString>,
337    ) -> Self {
338        debug_assert!(
339            range_in_blamed_file.end > range_in_blamed_file.start,
340            "{range_in_blamed_file:?}"
341        );
342        debug_assert!(
343            range_in_source_file.end > range_in_source_file.start,
344            "{range_in_source_file:?}"
345        );
346        debug_assert_eq!(range_in_source_file.len(), range_in_blamed_file.len());
347
348        Self {
349            start_in_blamed_file: range_in_blamed_file.start,
350            start_in_source_file: range_in_source_file.start,
351            len: NonZeroU32::new(range_in_blamed_file.len() as u32).expect("BUG: hunks are never empty"),
352            commit_id,
353            source_file_name,
354        }
355    }
356}
357
358impl BlameEntry {
359    /// Return the range of tokens this entry spans in the *Blamed File*.
360    pub fn range_in_blamed_file(&self) -> Range<usize> {
361        let start = self.start_in_blamed_file as usize;
362        start..start + self.len.get() as usize
363    }
364    /// Return the range of tokens this entry spans in the *Source File*.
365    pub fn range_in_source_file(&self) -> Range<usize> {
366        let start = self.start_in_source_file as usize;
367        start..start + self.len.get() as usize
368    }
369}
370
371pub(crate) trait LineRange {
372    fn shift_by(&self, offset: Offset) -> Self;
373}
374
375impl LineRange for Range<u32> {
376    fn shift_by(&self, offset: Offset) -> Self {
377        offset.shifted_range(self)
378    }
379}
380
381/// Tracks the hunks in the *Blamed File* that are not yet associated with the commit that introduced them.
382#[derive(Debug, PartialEq)]
383pub struct UnblamedHunk {
384    /// The range in the file that is being blamed that this hunk represents.
385    pub range_in_blamed_file: Range<u32>,
386    /// Maps a commit to the range in a source file (i.e. *Blamed File* at a revision) that is
387    /// equal to `range_in_blamed_file`. Since `suspects` rarely contains more than 1 item, it can
388    /// efficiently be stored as a `SmallVec`.
389    pub suspects: SmallVec<[(ObjectId, Range<u32>); 1]>,
390    /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
391    pub source_file_name: Option<BString>,
392}
393
394impl UnblamedHunk {
395    pub(crate) fn new(from_range_in_blamed_file: Range<u32>, suspect: ObjectId) -> Self {
396        let range_start = from_range_in_blamed_file.start;
397        let range_end = from_range_in_blamed_file.end;
398
399        UnblamedHunk {
400            range_in_blamed_file: range_start..range_end,
401            suspects: [(suspect, range_start..range_end)].into(),
402            source_file_name: None,
403        }
404    }
405
406    pub(crate) fn has_suspect(&self, suspect: &ObjectId) -> bool {
407        self.suspects.iter().any(|entry| entry.0 == *suspect)
408    }
409
410    pub(crate) fn get_range(&self, suspect: &ObjectId) -> Option<&Range<u32>> {
411        self.suspects
412            .iter()
413            .find(|entry| entry.0 == *suspect)
414            .map(|entry| &entry.1)
415    }
416}
417
418#[derive(Debug)]
419pub(crate) enum Either<T, U> {
420    Left(T),
421    Right(U),
422}
423
424/// A single change between two blobs, or an unchanged region.
425///
426/// Line numbers refer to the file that is referred to as `after` or `NewOrDestination`, depending
427/// on the context.
428#[derive(Clone, Debug, PartialEq)]
429pub enum Change {
430    /// A range of tokens that wasn't changed.
431    Unchanged(Range<u32>),
432    /// `(added_line_range, num_deleted_in_before)`
433    AddedOrReplaced(Range<u32>, u32),
434    /// `(line_to_start_deletion_at, num_deleted_in_before)`
435    Deleted(u32, u32),
436}