gix_blame/
types.rs

1use gix_hash::ObjectId;
2use gix_object::bstr::BString;
3use smallvec::SmallVec;
4use std::ops::RangeInclusive;
5use std::{
6    num::NonZeroU32,
7    ops::{AddAssign, Range, SubAssign},
8};
9
10use crate::file::function::tokens_for_diffing;
11use crate::Error;
12
13/// A type to represent one or more line ranges to blame in a file.
14///
15/// It handles the conversion between git's 1-based inclusive ranges and the internal
16/// 0-based exclusive ranges used by the blame algorithm.
17///
18/// # Examples
19///
20/// ```rust
21/// use gix_blame::BlameRanges;
22///
23/// // Blame lines 20 through 40 (inclusive)
24/// let range = BlameRanges::from_range(20..=40);
25///
26/// // Blame multiple ranges
27/// let mut ranges = BlameRanges::new();
28/// ranges.add_range(1..=4);   // Lines 1-4
29/// ranges.add_range(10..=14); // Lines 10-14
30/// ```
31///
32/// # Line Number Representation
33///
34/// This type uses 1-based inclusive ranges to mirror `git`'s behaviour:
35/// - A range of `20..=40` represents 21 lines, spanning from line 20 up to and including line 40
36/// - This will be converted to `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end
37///
38/// # Empty Ranges
39///
40/// An empty `BlameRanges` (created via `BlameRanges::new()` or `BlameRanges::default()`) means
41/// to blame the entire file, similar to running `git blame` without line number arguments.
42#[derive(Debug, Clone, Default)]
43pub struct BlameRanges {
44    /// The ranges to blame, stored as 1-based inclusive ranges
45    /// An empty Vec means blame the entire file
46    ranges: Vec<RangeInclusive<u32>>,
47}
48
49/// Lifecycle
50impl BlameRanges {
51    /// Create a new empty BlameRanges instance.
52    ///
53    /// An empty instance means to blame the entire file.
54    pub fn new() -> Self {
55        Self::default()
56    }
57
58    /// Create from a single range.
59    ///
60    /// The range is 1-based, similar to git's line number format.
61    pub fn from_range(range: RangeInclusive<u32>) -> Self {
62        Self { ranges: vec![range] }
63    }
64
65    /// Create from multiple ranges.
66    ///
67    /// All ranges are 1-based.
68    /// Overlapping or adjacent ranges will be merged.
69    pub fn from_ranges(ranges: Vec<RangeInclusive<u32>>) -> Self {
70        let mut result = Self::new();
71        for range in ranges {
72            result.merge_range(range);
73        }
74        result
75    }
76}
77
78impl BlameRanges {
79    /// Add a single range to blame.
80    ///
81    /// The range should be 1-based inclusive.
82    /// If the new range overlaps with or is adjacent to an existing range,
83    /// they will be merged into a single range.
84    pub fn add_range(&mut self, new_range: RangeInclusive<u32>) {
85        self.merge_range(new_range);
86    }
87
88    /// Attempts to merge the new range with any existing ranges.
89    /// If no merge is possible, add it as a new range.
90    fn merge_range(&mut self, new_range: RangeInclusive<u32>) {
91        // Check if this range can be merged with any existing range
92        for range in &mut self.ranges {
93            // Check if ranges overlap or are adjacent
94            if new_range.start() <= range.end() && range.start() <= new_range.end() {
95                *range = *range.start().min(new_range.start())..=*range.end().max(new_range.end());
96                return;
97            }
98        }
99        // If no overlap found, add it as a new range
100        self.ranges.push(new_range);
101    }
102
103    /// Convert the 1-based inclusive ranges to 0-based exclusive ranges.
104    ///
105    /// This is used internally by the blame algorithm to convert from git's line number format
106    /// to the internal format used for processing.
107    ///
108    /// # Errors
109    ///
110    /// Returns `Error::InvalidLineRange` if:
111    /// - Any range starts at 0 (must be 1-based)
112    /// - Any range extends beyond the file's length
113    /// - Any range has the same start and end
114    pub fn to_zero_based_exclusive(&self, max_lines: u32) -> Result<Vec<Range<u32>>, Error> {
115        if self.ranges.is_empty() {
116            let range = 0..max_lines;
117            return Ok(vec![range]);
118        }
119
120        let mut result = Vec::with_capacity(self.ranges.len());
121        for range in &self.ranges {
122            if *range.start() == 0 {
123                return Err(Error::InvalidLineRange);
124            }
125            let start = range.start() - 1;
126            let end = *range.end();
127            if start >= max_lines || end > max_lines || start == end {
128                return Err(Error::InvalidLineRange);
129            }
130            result.push(start..end);
131        }
132        Ok(result)
133    }
134
135    /// Returns true if no specific ranges are set (meaning blame entire file)
136    pub fn is_empty(&self) -> bool {
137        self.ranges.is_empty()
138    }
139}
140
141/// Options to be passed to [`file()`](crate::file()).
142#[derive(Default, Debug, Clone)]
143pub struct Options {
144    /// The algorithm to use for diffing.
145    pub diff_algorithm: gix_diff::blob::Algorithm,
146    /// The ranges to blame in the file.
147    pub range: BlameRanges,
148    /// Don't consider commits before the given date.
149    pub since: Option<gix_date::Time>,
150    /// Determine if rename tracking should be performed, and how.
151    pub rewrites: Option<gix_diff::Rewrites>,
152    /// Collect debug information whenever there's a diff or rename that affects the outcome of a
153    /// blame.
154    pub debug_track_path: bool,
155}
156
157/// Represents a change during history traversal for blame. It is supposed to capture enough
158/// information to allow reconstruction of the way a blame was performed, i. e. the path the
159/// history traversal, combined with repeated diffing of two subsequent states in this history, has
160/// taken.
161///
162/// This is intended for debugging purposes.
163#[derive(Clone, Debug)]
164pub struct BlamePathEntry {
165    /// The path to the *Source File* in the blob after the change.
166    pub source_file_path: BString,
167    /// The path to the *Source File* in the blob before the change. Allows
168    /// detection of renames. `None` for root commits.
169    pub previous_source_file_path: Option<BString>,
170    /// The commit id associated with the state after the change.
171    pub commit_id: ObjectId,
172    /// The blob id associated with the state after the change.
173    pub blob_id: ObjectId,
174    /// The blob id associated with the state before the change.
175    pub previous_blob_id: ObjectId,
176    /// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent
177    /// commit the change is related.
178    pub parent_index: usize,
179}
180
181/// The outcome of [`file()`](crate::file()).
182#[derive(Debug, Default, Clone)]
183pub struct Outcome {
184    /// One entry in sequential order, to associate a hunk in the blamed file with the source commit (and its lines)
185    /// that introduced it.
186    pub entries: Vec<BlameEntry>,
187    /// A buffer with the file content of the *Blamed File*, ready for tokenization.
188    pub blob: Vec<u8>,
189    /// Additional information about the amount of work performed to produce the blame.
190    pub statistics: Statistics,
191    /// Contains a log of all changes that affected the outcome of this blame.
192    pub blame_path: Option<Vec<BlamePathEntry>>,
193}
194
195/// Additional information about the performed operations.
196#[derive(Debug, Default, Copy, Clone)]
197pub struct Statistics {
198    /// The amount of commits it traversed until the blame was complete.
199    pub commits_traversed: usize,
200    /// The amount of trees that were decoded to find the entry of the file to blame.
201    pub trees_decoded: usize,
202    /// The amount of tree-diffs to see if the filepath was added, deleted or modified. These diffs
203    /// are likely partial as they are cancelled as soon as a change to the blamed file is
204    /// detected.
205    pub trees_diffed: usize,
206    /// The amount of tree-diffs to see if the file was moved (or rewritten, in git terminology).
207    /// These diffs are likely partial as they are cancelled as soon as a change to the blamed file
208    /// is detected.
209    pub trees_diffed_with_rewrites: usize,
210    /// The amount of blobs there were compared to each other to learn what changed between commits.
211    /// Note that in order to diff a blob, one needs to load both versions from the database.
212    pub blobs_diffed: usize,
213}
214
215impl Outcome {
216    /// Return an iterator over each entry in [`Self::entries`], along with its lines, line by line.
217    ///
218    /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used
219    /// to perform the diffs, which is what this method assures.
220    pub fn entries_with_lines(&self) -> impl Iterator<Item = (BlameEntry, Vec<BString>)> + '_ {
221        use gix_diff::blob::intern::TokenSource;
222        let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100);
223        let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob)
224            .tokenize()
225            .map(|token| interner.intern(token))
226            .collect();
227        self.entries.iter().map(move |e| {
228            (
229                e.clone(),
230                lines_as_tokens[e.range_in_blamed_file()]
231                    .iter()
232                    .map(|token| BString::new(interner[*token].into()))
233                    .collect(),
234            )
235        })
236    }
237}
238
239/// Describes the offset of a particular hunk relative to the *Blamed File*.
240#[derive(Clone, Copy, Debug, PartialEq)]
241pub enum Offset {
242    /// The amount of lines to add.
243    Added(u32),
244    /// The amount of lines to remove.
245    Deleted(u32),
246}
247
248impl Offset {
249    /// Shift the given `range` according to our offset.
250    pub fn shifted_range(&self, range: &Range<u32>) -> Range<u32> {
251        match self {
252            Offset::Added(added) => {
253                debug_assert!(range.start >= *added, "{self:?} {range:?}");
254                Range {
255                    start: range.start - added,
256                    end: range.end - added,
257                }
258            }
259            Offset::Deleted(deleted) => Range {
260                start: range.start + deleted,
261                end: range.end + deleted,
262            },
263        }
264    }
265}
266
267impl AddAssign<u32> for Offset {
268    fn add_assign(&mut self, rhs: u32) {
269        match self {
270            Self::Added(added) => *self = Self::Added(*added + rhs),
271            Self::Deleted(deleted) => {
272                if rhs > *deleted {
273                    *self = Self::Added(rhs - *deleted);
274                } else {
275                    *self = Self::Deleted(*deleted - rhs);
276                }
277            }
278        }
279    }
280}
281
282impl SubAssign<u32> for Offset {
283    fn sub_assign(&mut self, rhs: u32) {
284        match self {
285            Self::Added(added) => {
286                if rhs > *added {
287                    *self = Self::Deleted(rhs - *added);
288                } else {
289                    *self = Self::Added(*added - rhs);
290                }
291            }
292            Self::Deleted(deleted) => *self = Self::Deleted(*deleted + rhs),
293        }
294    }
295}
296
297/// A mapping of a section of the *Blamed File* to the section in a *Source File* that introduced it.
298///
299/// Both ranges are of the same size, but may use different [starting points](Range::start). Naturally,
300/// they have the same content, which is the reason they are in what is returned by [`file()`](crate::file()).
301#[derive(Clone, Debug, PartialEq)]
302pub struct BlameEntry {
303    /// The index of the token in the *Blamed File* (typically lines) where this entry begins.
304    pub start_in_blamed_file: u32,
305    /// The index of the token in the *Source File* (typically lines) where this entry begins.
306    ///
307    /// This is possibly offset compared to `start_in_blamed_file`.
308    pub start_in_source_file: u32,
309    /// The amount of lines the hunk is spanning.
310    pub len: NonZeroU32,
311    /// The commit that introduced the section into the *Source File*.
312    pub commit_id: ObjectId,
313    /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
314    /// This happens when the file was renamed.
315    pub source_file_name: Option<BString>,
316}
317
318impl BlameEntry {
319    /// Create a new instance.
320    pub fn new(
321        range_in_blamed_file: Range<u32>,
322        range_in_source_file: Range<u32>,
323        commit_id: ObjectId,
324        source_file_name: Option<BString>,
325    ) -> Self {
326        debug_assert!(
327            range_in_blamed_file.end > range_in_blamed_file.start,
328            "{range_in_blamed_file:?}"
329        );
330        debug_assert!(
331            range_in_source_file.end > range_in_source_file.start,
332            "{range_in_source_file:?}"
333        );
334        debug_assert_eq!(range_in_source_file.len(), range_in_blamed_file.len());
335
336        Self {
337            start_in_blamed_file: range_in_blamed_file.start,
338            start_in_source_file: range_in_source_file.start,
339            len: NonZeroU32::new(range_in_blamed_file.len() as u32).expect("BUG: hunks are never empty"),
340            commit_id,
341            source_file_name,
342        }
343    }
344}
345
346impl BlameEntry {
347    /// Return the range of tokens this entry spans in the *Blamed File*.
348    pub fn range_in_blamed_file(&self) -> Range<usize> {
349        let start = self.start_in_blamed_file as usize;
350        start..start + self.len.get() as usize
351    }
352    /// Return the range of tokens this entry spans in the *Source File*.
353    pub fn range_in_source_file(&self) -> Range<usize> {
354        let start = self.start_in_source_file as usize;
355        start..start + self.len.get() as usize
356    }
357}
358
359pub(crate) trait LineRange {
360    fn shift_by(&self, offset: Offset) -> Self;
361}
362
363impl LineRange for Range<u32> {
364    fn shift_by(&self, offset: Offset) -> Self {
365        offset.shifted_range(self)
366    }
367}
368
369/// Tracks the hunks in the *Blamed File* that are not yet associated with the commit that introduced them.
370#[derive(Debug, PartialEq)]
371pub struct UnblamedHunk {
372    /// The range in the file that is being blamed that this hunk represents.
373    pub range_in_blamed_file: Range<u32>,
374    /// Maps a commit to the range in a source file (i.e. *Blamed File* at a revision) that is
375    /// equal to `range_in_blamed_file`. Since `suspects` rarely contains more than 1 item, it can
376    /// efficiently be stored as a `SmallVec`.
377    pub suspects: SmallVec<[(ObjectId, Range<u32>); 1]>,
378    /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
379    pub source_file_name: Option<BString>,
380}
381
382impl UnblamedHunk {
383    pub(crate) fn has_suspect(&self, suspect: &ObjectId) -> bool {
384        self.suspects.iter().any(|entry| entry.0 == *suspect)
385    }
386
387    pub(crate) fn get_range(&self, suspect: &ObjectId) -> Option<&Range<u32>> {
388        self.suspects
389            .iter()
390            .find(|entry| entry.0 == *suspect)
391            .map(|entry| &entry.1)
392    }
393}
394
395#[derive(Debug)]
396pub(crate) enum Either<T, U> {
397    Left(T),
398    Right(U),
399}
400
401/// A single change between two blobs, or an unchanged region.
402///
403/// Line numbers refer to the file that is referred to as `after` or `NewOrDestination`, depending
404/// on the context.
405#[derive(Clone, Debug, PartialEq)]
406pub enum Change {
407    /// A range of tokens that wasn't changed.
408    Unchanged(Range<u32>),
409    /// `(added_line_range, num_deleted_in_before)`
410    AddedOrReplaced(Range<u32>, u32),
411    /// `(line_to_start_deletion_at, num_deleted_in_before)`
412    Deleted(u32, u32),
413}