gix_blame/types.rs
1use gix_hash::ObjectId;
2use gix_object::bstr::BString;
3use smallvec::SmallVec;
4use std::ops::RangeInclusive;
5use std::{
6 num::NonZeroU32,
7 ops::{AddAssign, Range, SubAssign},
8};
9
10use crate::file::function::tokens_for_diffing;
11use crate::Error;
12
13/// A type to represent one or more line ranges to blame in a file.
14///
15/// It handles the conversion between git's 1-based inclusive ranges and the internal
16/// 0-based exclusive ranges used by the blame algorithm.
17///
18/// # Examples
19///
20/// ```rust
21/// use gix_blame::BlameRanges;
22///
23/// // Blame lines 20 through 40 (inclusive)
24/// let range = BlameRanges::from_one_based_inclusive_range(20..=40);
25///
26/// // Blame multiple ranges
27/// let mut ranges = BlameRanges::from_one_based_inclusive_ranges(vec![
28/// 1..=4, // Lines 1-4
29/// 10..=14, // Lines 10-14
30/// ]
31/// );
32/// ```
33///
34/// # Line Number Representation
35///
36/// This type uses 1-based inclusive ranges to mirror `git`'s behaviour:
37/// - A range of `20..=40` represents 21 lines, spanning from line 20 up to and including line 40
38/// - This will be converted to `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end
39///
40/// # Empty Ranges
41/// You can blame the entire file by calling `BlameRanges::default()`, or by passing an empty vector to `from_one_based_inclusive_ranges`.
42#[derive(Debug, Clone, Default)]
43pub enum BlameRanges {
44 /// Blame the entire file.
45 #[default]
46 WholeFile,
47 /// Blame ranges in 0-based exclusive format.
48 PartialFile(Vec<Range<u32>>),
49}
50
51/// Lifecycle
52impl BlameRanges {
53 /// Create from a single 0-based range.
54 ///
55 /// Note that the input range is 1-based inclusive, as used by git, and
56 /// the output is a zero-based `BlameRanges` instance.
57 pub fn from_one_based_inclusive_range(range: RangeInclusive<u32>) -> Result<Self, Error> {
58 let zero_based_range = Self::inclusive_to_zero_based_exclusive(range)?;
59 Ok(Self::PartialFile(vec![zero_based_range]))
60 }
61
62 /// Create from multiple 0-based ranges.
63 ///
64 /// Note that the input ranges are 1-based inclusive, as used by git, and
65 /// the output is a zero-based `BlameRanges` instance.
66 ///
67 /// If the input vector is empty, the result will be `WholeFile`.
68 pub fn from_one_based_inclusive_ranges(ranges: Vec<RangeInclusive<u32>>) -> Result<Self, Error> {
69 if ranges.is_empty() {
70 return Ok(Self::WholeFile);
71 }
72
73 let zero_based_ranges = ranges
74 .into_iter()
75 .map(Self::inclusive_to_zero_based_exclusive)
76 .collect::<Vec<_>>();
77 let mut result = Self::PartialFile(vec![]);
78 for range in zero_based_ranges {
79 result.merge_zero_based_exclusive_range(range?);
80 }
81 Ok(result)
82 }
83
84 /// Convert a 1-based inclusive range to a 0-based exclusive range.
85 fn inclusive_to_zero_based_exclusive(range: RangeInclusive<u32>) -> Result<Range<u32>, Error> {
86 if range.start() == &0 {
87 return Err(Error::InvalidOneBasedLineRange);
88 }
89 let start = range.start() - 1;
90 let end = *range.end();
91 Ok(start..end)
92 }
93}
94
95impl BlameRanges {
96 /// Add a single range to blame.
97 ///
98 /// The new range will be merged with any overlapping existing ranges.
99 pub fn add_one_based_inclusive_range(&mut self, new_range: RangeInclusive<u32>) -> Result<(), Error> {
100 let zero_based_range = Self::inclusive_to_zero_based_exclusive(new_range)?;
101 self.merge_zero_based_exclusive_range(zero_based_range);
102
103 Ok(())
104 }
105
106 /// Adds a new ranges, merging it with any existing overlapping ranges.
107 fn merge_zero_based_exclusive_range(&mut self, new_range: Range<u32>) {
108 match self {
109 Self::PartialFile(ref mut ranges) => {
110 // Partition ranges into those that don't overlap and those that do.
111 let (mut non_overlapping, overlapping): (Vec<_>, Vec<_>) = ranges
112 .drain(..)
113 .partition(|range| new_range.end < range.start || range.end < new_range.start);
114
115 let merged_range = overlapping.into_iter().fold(new_range, |acc, range| {
116 acc.start.min(range.start)..acc.end.max(range.end)
117 });
118
119 non_overlapping.push(merged_range);
120
121 *ranges = non_overlapping;
122 ranges.sort_by(|a, b| a.start.cmp(&b.start));
123 }
124 Self::WholeFile => *self = Self::PartialFile(vec![new_range]),
125 }
126 }
127
128 /// Gets zero-based exclusive ranges.
129 pub fn to_zero_based_exclusive_ranges(&self, max_lines: u32) -> Vec<Range<u32>> {
130 match self {
131 Self::WholeFile => {
132 let full_range = 0..max_lines;
133 vec![full_range]
134 }
135 Self::PartialFile(ranges) => ranges
136 .iter()
137 .filter_map(|range| {
138 if range.end < max_lines {
139 return Some(range.clone());
140 }
141
142 if range.start < max_lines {
143 Some(range.start..max_lines)
144 } else {
145 None
146 }
147 })
148 .collect(),
149 }
150 }
151}
152
153/// Options to be passed to [`file()`](crate::file()).
154#[derive(Default, Debug, Clone)]
155pub struct Options {
156 /// The algorithm to use for diffing.
157 pub diff_algorithm: gix_diff::blob::Algorithm,
158 /// The ranges to blame in the file.
159 pub ranges: BlameRanges,
160 /// Don't consider commits before the given date.
161 pub since: Option<gix_date::Time>,
162 /// Determine if rename tracking should be performed, and how.
163 pub rewrites: Option<gix_diff::Rewrites>,
164 /// Collect debug information whenever there's a diff or rename that affects the outcome of a
165 /// blame.
166 pub debug_track_path: bool,
167}
168
169/// Represents a change during history traversal for blame. It is supposed to capture enough
170/// information to allow reconstruction of the way a blame was performed, i. e. the path the
171/// history traversal, combined with repeated diffing of two subsequent states in this history, has
172/// taken.
173///
174/// This is intended for debugging purposes.
175#[derive(Clone, Debug)]
176pub struct BlamePathEntry {
177 /// The path to the *Source File* in the blob after the change.
178 pub source_file_path: BString,
179 /// The path to the *Source File* in the blob before the change. Allows
180 /// detection of renames. `None` for root commits.
181 pub previous_source_file_path: Option<BString>,
182 /// The commit id associated with the state after the change.
183 pub commit_id: ObjectId,
184 /// The blob id associated with the state after the change.
185 pub blob_id: ObjectId,
186 /// The blob id associated with the state before the change.
187 pub previous_blob_id: ObjectId,
188 /// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent
189 /// commit the change is related.
190 pub parent_index: usize,
191}
192
193/// The outcome of [`file()`](crate::file()).
194#[derive(Debug, Default, Clone)]
195pub struct Outcome {
196 /// One entry in sequential order, to associate a hunk in the blamed file with the source commit (and its lines)
197 /// that introduced it.
198 pub entries: Vec<BlameEntry>,
199 /// A buffer with the file content of the *Blamed File*, ready for tokenization.
200 pub blob: Vec<u8>,
201 /// Additional information about the amount of work performed to produce the blame.
202 pub statistics: Statistics,
203 /// Contains a log of all changes that affected the outcome of this blame.
204 pub blame_path: Option<Vec<BlamePathEntry>>,
205}
206
207/// Additional information about the performed operations.
208#[derive(Debug, Default, Copy, Clone)]
209pub struct Statistics {
210 /// The amount of commits it traversed until the blame was complete.
211 pub commits_traversed: usize,
212 /// The amount of trees that were decoded to find the entry of the file to blame.
213 pub trees_decoded: usize,
214 /// The amount of tree-diffs to see if the filepath was added, deleted or modified. These diffs
215 /// are likely partial as they are cancelled as soon as a change to the blamed file is
216 /// detected.
217 pub trees_diffed: usize,
218 /// The amount of tree-diffs to see if the file was moved (or rewritten, in git terminology).
219 /// These diffs are likely partial as they are cancelled as soon as a change to the blamed file
220 /// is detected.
221 pub trees_diffed_with_rewrites: usize,
222 /// The amount of blobs there were compared to each other to learn what changed between commits.
223 /// Note that in order to diff a blob, one needs to load both versions from the database.
224 pub blobs_diffed: usize,
225}
226
227impl Outcome {
228 /// Return an iterator over each entry in [`Self::entries`], along with its lines, line by line.
229 ///
230 /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used
231 /// to perform the diffs, which is what this method assures.
232 pub fn entries_with_lines(&self) -> impl Iterator<Item = (BlameEntry, Vec<BString>)> + '_ {
233 use gix_diff::blob::intern::TokenSource;
234 let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100);
235 let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob)
236 .tokenize()
237 .map(|token| interner.intern(token))
238 .collect();
239 self.entries.iter().map(move |e| {
240 (
241 e.clone(),
242 lines_as_tokens[e.range_in_blamed_file()]
243 .iter()
244 .map(|token| BString::new(interner[*token].into()))
245 .collect(),
246 )
247 })
248 }
249}
250
251/// Describes the offset of a particular hunk relative to the *Blamed File*.
252#[derive(Clone, Copy, Debug, PartialEq)]
253pub enum Offset {
254 /// The amount of lines to add.
255 Added(u32),
256 /// The amount of lines to remove.
257 Deleted(u32),
258}
259
260impl Offset {
261 /// Shift the given `range` according to our offset.
262 pub fn shifted_range(&self, range: &Range<u32>) -> Range<u32> {
263 match self {
264 Offset::Added(added) => {
265 debug_assert!(range.start >= *added, "{self:?} {range:?}");
266 Range {
267 start: range.start - added,
268 end: range.end - added,
269 }
270 }
271 Offset::Deleted(deleted) => Range {
272 start: range.start + deleted,
273 end: range.end + deleted,
274 },
275 }
276 }
277}
278
279impl AddAssign<u32> for Offset {
280 fn add_assign(&mut self, rhs: u32) {
281 match self {
282 Self::Added(added) => *self = Self::Added(*added + rhs),
283 Self::Deleted(deleted) => {
284 if rhs > *deleted {
285 *self = Self::Added(rhs - *deleted);
286 } else {
287 *self = Self::Deleted(*deleted - rhs);
288 }
289 }
290 }
291 }
292}
293
294impl SubAssign<u32> for Offset {
295 fn sub_assign(&mut self, rhs: u32) {
296 match self {
297 Self::Added(added) => {
298 if rhs > *added {
299 *self = Self::Deleted(rhs - *added);
300 } else {
301 *self = Self::Added(*added - rhs);
302 }
303 }
304 Self::Deleted(deleted) => *self = Self::Deleted(*deleted + rhs),
305 }
306 }
307}
308
309/// A mapping of a section of the *Blamed File* to the section in a *Source File* that introduced it.
310///
311/// Both ranges are of the same size, but may use different [starting points](Range::start). Naturally,
312/// they have the same content, which is the reason they are in what is returned by [`file()`](crate::file()).
313#[derive(Clone, Debug, PartialEq)]
314pub struct BlameEntry {
315 /// The index of the token in the *Blamed File* (typically lines) where this entry begins.
316 pub start_in_blamed_file: u32,
317 /// The index of the token in the *Source File* (typically lines) where this entry begins.
318 ///
319 /// This is possibly offset compared to `start_in_blamed_file`.
320 pub start_in_source_file: u32,
321 /// The amount of lines the hunk is spanning.
322 pub len: NonZeroU32,
323 /// The commit that introduced the section into the *Source File*.
324 pub commit_id: ObjectId,
325 /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
326 /// This happens when the file was renamed.
327 pub source_file_name: Option<BString>,
328}
329
330impl BlameEntry {
331 /// Create a new instance.
332 pub fn new(
333 range_in_blamed_file: Range<u32>,
334 range_in_source_file: Range<u32>,
335 commit_id: ObjectId,
336 source_file_name: Option<BString>,
337 ) -> Self {
338 debug_assert!(
339 range_in_blamed_file.end > range_in_blamed_file.start,
340 "{range_in_blamed_file:?}"
341 );
342 debug_assert!(
343 range_in_source_file.end > range_in_source_file.start,
344 "{range_in_source_file:?}"
345 );
346 debug_assert_eq!(range_in_source_file.len(), range_in_blamed_file.len());
347
348 Self {
349 start_in_blamed_file: range_in_blamed_file.start,
350 start_in_source_file: range_in_source_file.start,
351 len: NonZeroU32::new(range_in_blamed_file.len() as u32).expect("BUG: hunks are never empty"),
352 commit_id,
353 source_file_name,
354 }
355 }
356}
357
358impl BlameEntry {
359 /// Return the range of tokens this entry spans in the *Blamed File*.
360 pub fn range_in_blamed_file(&self) -> Range<usize> {
361 let start = self.start_in_blamed_file as usize;
362 start..start + self.len.get() as usize
363 }
364 /// Return the range of tokens this entry spans in the *Source File*.
365 pub fn range_in_source_file(&self) -> Range<usize> {
366 let start = self.start_in_source_file as usize;
367 start..start + self.len.get() as usize
368 }
369}
370
371pub(crate) trait LineRange {
372 fn shift_by(&self, offset: Offset) -> Self;
373}
374
375impl LineRange for Range<u32> {
376 fn shift_by(&self, offset: Offset) -> Self {
377 offset.shifted_range(self)
378 }
379}
380
381/// Tracks the hunks in the *Blamed File* that are not yet associated with the commit that introduced them.
382#[derive(Debug, PartialEq)]
383pub struct UnblamedHunk {
384 /// The range in the file that is being blamed that this hunk represents.
385 pub range_in_blamed_file: Range<u32>,
386 /// Maps a commit to the range in a source file (i.e. *Blamed File* at a revision) that is
387 /// equal to `range_in_blamed_file`. Since `suspects` rarely contains more than 1 item, it can
388 /// efficiently be stored as a `SmallVec`.
389 pub suspects: SmallVec<[(ObjectId, Range<u32>); 1]>,
390 /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
391 pub source_file_name: Option<BString>,
392}
393
394impl UnblamedHunk {
395 pub(crate) fn new(from_range_in_blamed_file: Range<u32>, suspect: ObjectId) -> Self {
396 let range_start = from_range_in_blamed_file.start;
397 let range_end = from_range_in_blamed_file.end;
398
399 UnblamedHunk {
400 range_in_blamed_file: range_start..range_end,
401 suspects: [(suspect, range_start..range_end)].into(),
402 source_file_name: None,
403 }
404 }
405
406 pub(crate) fn has_suspect(&self, suspect: &ObjectId) -> bool {
407 self.suspects.iter().any(|entry| entry.0 == *suspect)
408 }
409
410 pub(crate) fn get_range(&self, suspect: &ObjectId) -> Option<&Range<u32>> {
411 self.suspects
412 .iter()
413 .find(|entry| entry.0 == *suspect)
414 .map(|entry| &entry.1)
415 }
416}
417
418#[derive(Debug)]
419pub(crate) enum Either<T, U> {
420 Left(T),
421 Right(U),
422}
423
424/// A single change between two blobs, or an unchanged region.
425///
426/// Line numbers refer to the file that is referred to as `after` or `NewOrDestination`, depending
427/// on the context.
428#[derive(Clone, Debug, PartialEq)]
429pub enum Change {
430 /// A range of tokens that wasn't changed.
431 Unchanged(Range<u32>),
432 /// `(added_line_range, num_deleted_in_before)`
433 AddedOrReplaced(Range<u32>, u32),
434 /// `(line_to_start_deletion_at, num_deleted_in_before)`
435 Deleted(u32, u32),
436}