gix_blame/types.rs
1use gix_hash::ObjectId;
2use gix_object::bstr::BString;
3use smallvec::SmallVec;
4use std::ops::RangeInclusive;
5use std::{
6 num::NonZeroU32,
7 ops::{AddAssign, Range, SubAssign},
8};
9
10use crate::file::function::tokens_for_diffing;
11use crate::Error;
12
13/// A type to represent one or more line ranges to blame in a file.
14///
15/// It handles the conversion between git's 1-based inclusive ranges and the internal
16/// 0-based exclusive ranges used by the blame algorithm.
17///
18/// # Examples
19///
20/// ```rust
21/// use gix_blame::BlameRanges;
22///
23/// // Blame lines 20 through 40 (inclusive)
24/// let range = BlameRanges::from_range(20..=40);
25///
26/// // Blame multiple ranges
27/// let mut ranges = BlameRanges::new();
28/// ranges.add_range(1..=4); // Lines 1-4
29/// ranges.add_range(10..=14); // Lines 10-14
30/// ```
31///
32/// # Line Number Representation
33///
34/// This type uses 1-based inclusive ranges to mirror `git`'s behaviour:
35/// - A range of `20..=40` represents 21 lines, spanning from line 20 up to and including line 40
36/// - This will be converted to `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end
37///
38/// # Empty Ranges
39///
40/// An empty `BlameRanges` (created via `BlameRanges::new()` or `BlameRanges::default()`) means
41/// to blame the entire file, similar to running `git blame` without line number arguments.
42#[derive(Debug, Clone, Default)]
43pub struct BlameRanges {
44 /// The ranges to blame, stored as 1-based inclusive ranges
45 /// An empty Vec means blame the entire file
46 ranges: Vec<RangeInclusive<u32>>,
47}
48
49/// Lifecycle
50impl BlameRanges {
51 /// Create a new empty BlameRanges instance.
52 ///
53 /// An empty instance means to blame the entire file.
54 pub fn new() -> Self {
55 Self::default()
56 }
57
58 /// Create from a single range.
59 ///
60 /// The range is 1-based, similar to git's line number format.
61 pub fn from_range(range: RangeInclusive<u32>) -> Self {
62 Self { ranges: vec![range] }
63 }
64
65 /// Create from multiple ranges.
66 ///
67 /// All ranges are 1-based.
68 /// Overlapping or adjacent ranges will be merged.
69 pub fn from_ranges(ranges: Vec<RangeInclusive<u32>>) -> Self {
70 let mut result = Self::new();
71 for range in ranges {
72 result.merge_range(range);
73 }
74 result
75 }
76}
77
78impl BlameRanges {
79 /// Add a single range to blame.
80 ///
81 /// The range should be 1-based inclusive.
82 /// If the new range overlaps with or is adjacent to an existing range,
83 /// they will be merged into a single range.
84 pub fn add_range(&mut self, new_range: RangeInclusive<u32>) {
85 self.merge_range(new_range);
86 }
87
88 /// Attempts to merge the new range with any existing ranges.
89 /// If no merge is possible, add it as a new range.
90 fn merge_range(&mut self, new_range: RangeInclusive<u32>) {
91 // Check if this range can be merged with any existing range
92 for range in &mut self.ranges {
93 // Check if ranges overlap or are adjacent
94 if new_range.start() <= range.end() && range.start() <= new_range.end() {
95 *range = *range.start().min(new_range.start())..=*range.end().max(new_range.end());
96 return;
97 }
98 }
99 // If no overlap found, add it as a new range
100 self.ranges.push(new_range);
101 }
102
103 /// Convert the 1-based inclusive ranges to 0-based exclusive ranges.
104 ///
105 /// This is used internally by the blame algorithm to convert from git's line number format
106 /// to the internal format used for processing.
107 ///
108 /// # Errors
109 ///
110 /// Returns `Error::InvalidLineRange` if:
111 /// - Any range starts at 0 (must be 1-based)
112 /// - Any range extends beyond the file's length
113 /// - Any range has the same start and end
114 pub fn to_zero_based_exclusive(&self, max_lines: u32) -> Result<Vec<Range<u32>>, Error> {
115 if self.ranges.is_empty() {
116 let range = 0..max_lines;
117 return Ok(vec![range]);
118 }
119
120 let mut result = Vec::with_capacity(self.ranges.len());
121 for range in &self.ranges {
122 if *range.start() == 0 {
123 return Err(Error::InvalidLineRange);
124 }
125 let start = range.start() - 1;
126 let end = *range.end();
127 if start >= max_lines || end > max_lines || start == end {
128 return Err(Error::InvalidLineRange);
129 }
130 result.push(start..end);
131 }
132 Ok(result)
133 }
134
135 /// Returns true if no specific ranges are set (meaning blame entire file)
136 pub fn is_empty(&self) -> bool {
137 self.ranges.is_empty()
138 }
139}
140
141/// Options to be passed to [`file()`](crate::file()).
142#[derive(Default, Debug, Clone)]
143pub struct Options {
144 /// The algorithm to use for diffing.
145 pub diff_algorithm: gix_diff::blob::Algorithm,
146 /// The ranges to blame in the file.
147 pub range: BlameRanges,
148 /// Don't consider commits before the given date.
149 pub since: Option<gix_date::Time>,
150 /// Determine if rename tracking should be performed, and how.
151 pub rewrites: Option<gix_diff::Rewrites>,
152 /// Collect debug information whenever there's a diff or rename that affects the outcome of a
153 /// blame.
154 pub debug_track_path: bool,
155}
156
157/// Represents a change during history traversal for blame. It is supposed to capture enough
158/// information to allow reconstruction of the way a blame was performed, i. e. the path the
159/// history traversal, combined with repeated diffing of two subsequent states in this history, has
160/// taken.
161///
162/// This is intended for debugging purposes.
163#[derive(Clone, Debug)]
164pub struct BlamePathEntry {
165 /// The path to the *Source File* in the blob after the change.
166 pub source_file_path: BString,
167 /// The path to the *Source File* in the blob before the change. Allows
168 /// detection of renames. `None` for root commits.
169 pub previous_source_file_path: Option<BString>,
170 /// The commit id associated with the state after the change.
171 pub commit_id: ObjectId,
172 /// The blob id associated with the state after the change.
173 pub blob_id: ObjectId,
174 /// The blob id associated with the state before the change.
175 pub previous_blob_id: ObjectId,
176 /// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent
177 /// commit the change is related.
178 pub parent_index: usize,
179}
180
181/// The outcome of [`file()`](crate::file()).
182#[derive(Debug, Default, Clone)]
183pub struct Outcome {
184 /// One entry in sequential order, to associate a hunk in the blamed file with the source commit (and its lines)
185 /// that introduced it.
186 pub entries: Vec<BlameEntry>,
187 /// A buffer with the file content of the *Blamed File*, ready for tokenization.
188 pub blob: Vec<u8>,
189 /// Additional information about the amount of work performed to produce the blame.
190 pub statistics: Statistics,
191 /// Contains a log of all changes that affected the outcome of this blame.
192 pub blame_path: Option<Vec<BlamePathEntry>>,
193}
194
195/// Additional information about the performed operations.
196#[derive(Debug, Default, Copy, Clone)]
197pub struct Statistics {
198 /// The amount of commits it traversed until the blame was complete.
199 pub commits_traversed: usize,
200 /// The amount of trees that were decoded to find the entry of the file to blame.
201 pub trees_decoded: usize,
202 /// The amount of tree-diffs to see if the filepath was added, deleted or modified. These diffs
203 /// are likely partial as they are cancelled as soon as a change to the blamed file is
204 /// detected.
205 pub trees_diffed: usize,
206 /// The amount of tree-diffs to see if the file was moved (or rewritten, in git terminology).
207 /// These diffs are likely partial as they are cancelled as soon as a change to the blamed file
208 /// is detected.
209 pub trees_diffed_with_rewrites: usize,
210 /// The amount of blobs there were compared to each other to learn what changed between commits.
211 /// Note that in order to diff a blob, one needs to load both versions from the database.
212 pub blobs_diffed: usize,
213}
214
215impl Outcome {
216 /// Return an iterator over each entry in [`Self::entries`], along with its lines, line by line.
217 ///
218 /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used
219 /// to perform the diffs, which is what this method assures.
220 pub fn entries_with_lines(&self) -> impl Iterator<Item = (BlameEntry, Vec<BString>)> + '_ {
221 use gix_diff::blob::intern::TokenSource;
222 let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100);
223 let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob)
224 .tokenize()
225 .map(|token| interner.intern(token))
226 .collect();
227 self.entries.iter().map(move |e| {
228 (
229 e.clone(),
230 lines_as_tokens[e.range_in_blamed_file()]
231 .iter()
232 .map(|token| BString::new(interner[*token].into()))
233 .collect(),
234 )
235 })
236 }
237}
238
239/// Describes the offset of a particular hunk relative to the *Blamed File*.
240#[derive(Clone, Copy, Debug, PartialEq)]
241pub enum Offset {
242 /// The amount of lines to add.
243 Added(u32),
244 /// The amount of lines to remove.
245 Deleted(u32),
246}
247
248impl Offset {
249 /// Shift the given `range` according to our offset.
250 pub fn shifted_range(&self, range: &Range<u32>) -> Range<u32> {
251 match self {
252 Offset::Added(added) => {
253 debug_assert!(range.start >= *added, "{self:?} {range:?}");
254 Range {
255 start: range.start - added,
256 end: range.end - added,
257 }
258 }
259 Offset::Deleted(deleted) => Range {
260 start: range.start + deleted,
261 end: range.end + deleted,
262 },
263 }
264 }
265}
266
267impl AddAssign<u32> for Offset {
268 fn add_assign(&mut self, rhs: u32) {
269 match self {
270 Self::Added(added) => *self = Self::Added(*added + rhs),
271 Self::Deleted(deleted) => {
272 if rhs > *deleted {
273 *self = Self::Added(rhs - *deleted);
274 } else {
275 *self = Self::Deleted(*deleted - rhs);
276 }
277 }
278 }
279 }
280}
281
282impl SubAssign<u32> for Offset {
283 fn sub_assign(&mut self, rhs: u32) {
284 match self {
285 Self::Added(added) => {
286 if rhs > *added {
287 *self = Self::Deleted(rhs - *added);
288 } else {
289 *self = Self::Added(*added - rhs);
290 }
291 }
292 Self::Deleted(deleted) => *self = Self::Deleted(*deleted + rhs),
293 }
294 }
295}
296
297/// A mapping of a section of the *Blamed File* to the section in a *Source File* that introduced it.
298///
299/// Both ranges are of the same size, but may use different [starting points](Range::start). Naturally,
300/// they have the same content, which is the reason they are in what is returned by [`file()`](crate::file()).
301#[derive(Clone, Debug, PartialEq)]
302pub struct BlameEntry {
303 /// The index of the token in the *Blamed File* (typically lines) where this entry begins.
304 pub start_in_blamed_file: u32,
305 /// The index of the token in the *Source File* (typically lines) where this entry begins.
306 ///
307 /// This is possibly offset compared to `start_in_blamed_file`.
308 pub start_in_source_file: u32,
309 /// The amount of lines the hunk is spanning.
310 pub len: NonZeroU32,
311 /// The commit that introduced the section into the *Source File*.
312 pub commit_id: ObjectId,
313 /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
314 /// This happens when the file was renamed.
315 pub source_file_name: Option<BString>,
316}
317
318impl BlameEntry {
319 /// Create a new instance.
320 pub fn new(
321 range_in_blamed_file: Range<u32>,
322 range_in_source_file: Range<u32>,
323 commit_id: ObjectId,
324 source_file_name: Option<BString>,
325 ) -> Self {
326 debug_assert!(
327 range_in_blamed_file.end > range_in_blamed_file.start,
328 "{range_in_blamed_file:?}"
329 );
330 debug_assert!(
331 range_in_source_file.end > range_in_source_file.start,
332 "{range_in_source_file:?}"
333 );
334 debug_assert_eq!(range_in_source_file.len(), range_in_blamed_file.len());
335
336 Self {
337 start_in_blamed_file: range_in_blamed_file.start,
338 start_in_source_file: range_in_source_file.start,
339 len: NonZeroU32::new(range_in_blamed_file.len() as u32).expect("BUG: hunks are never empty"),
340 commit_id,
341 source_file_name,
342 }
343 }
344}
345
346impl BlameEntry {
347 /// Return the range of tokens this entry spans in the *Blamed File*.
348 pub fn range_in_blamed_file(&self) -> Range<usize> {
349 let start = self.start_in_blamed_file as usize;
350 start..start + self.len.get() as usize
351 }
352 /// Return the range of tokens this entry spans in the *Source File*.
353 pub fn range_in_source_file(&self) -> Range<usize> {
354 let start = self.start_in_source_file as usize;
355 start..start + self.len.get() as usize
356 }
357}
358
359pub(crate) trait LineRange {
360 fn shift_by(&self, offset: Offset) -> Self;
361}
362
363impl LineRange for Range<u32> {
364 fn shift_by(&self, offset: Offset) -> Self {
365 offset.shifted_range(self)
366 }
367}
368
369/// Tracks the hunks in the *Blamed File* that are not yet associated with the commit that introduced them.
370#[derive(Debug, PartialEq)]
371pub struct UnblamedHunk {
372 /// The range in the file that is being blamed that this hunk represents.
373 pub range_in_blamed_file: Range<u32>,
374 /// Maps a commit to the range in a source file (i.e. *Blamed File* at a revision) that is
375 /// equal to `range_in_blamed_file`. Since `suspects` rarely contains more than 1 item, it can
376 /// efficiently be stored as a `SmallVec`.
377 pub suspects: SmallVec<[(ObjectId, Range<u32>); 1]>,
378 /// The *Source File*'s name, in case it differs from *Blamed File*'s name.
379 pub source_file_name: Option<BString>,
380}
381
382impl UnblamedHunk {
383 pub(crate) fn has_suspect(&self, suspect: &ObjectId) -> bool {
384 self.suspects.iter().any(|entry| entry.0 == *suspect)
385 }
386
387 pub(crate) fn get_range(&self, suspect: &ObjectId) -> Option<&Range<u32>> {
388 self.suspects
389 .iter()
390 .find(|entry| entry.0 == *suspect)
391 .map(|entry| &entry.1)
392 }
393}
394
395#[derive(Debug)]
396pub(crate) enum Either<T, U> {
397 Left(T),
398 Right(U),
399}
400
401/// A single change between two blobs, or an unchanged region.
402///
403/// Line numbers refer to the file that is referred to as `after` or `NewOrDestination`, depending
404/// on the context.
405#[derive(Clone, Debug, PartialEq)]
406pub enum Change {
407 /// A range of tokens that wasn't changed.
408 Unchanged(Range<u32>),
409 /// `(added_line_range, num_deleted_in_before)`
410 AddedOrReplaced(Range<u32>, u32),
411 /// `(line_to_start_deletion_at, num_deleted_in_before)`
412 Deleted(u32, u32),
413}