jj_lib/
annotate.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Methods that allow annotation (attribution and blame) for a file in a
16//! repository.
17//!
18//! TODO: Add support for different blame layers with a trait in the future.
19//! Like commit metadata and more.
20
21use std::collections::hash_map;
22use std::collections::HashMap;
23use std::iter;
24use std::ops::Range;
25use std::rc::Rc;
26
27use bstr::BStr;
28use bstr::BString;
29use itertools::Itertools as _;
30use pollster::FutureExt as _;
31
32use crate::backend::BackendError;
33use crate::backend::CommitId;
34use crate::commit::Commit;
35use crate::conflicts::materialize_merge_result_to_bytes;
36use crate::conflicts::materialize_tree_value;
37use crate::conflicts::ConflictMarkerStyle;
38use crate::conflicts::MaterializedTreeValue;
39use crate::diff::Diff;
40use crate::diff::DiffHunkKind;
41use crate::fileset::FilesetExpression;
42use crate::graph::GraphEdge;
43use crate::graph::GraphEdgeType;
44use crate::merged_tree::MergedTree;
45use crate::repo::Repo;
46use crate::repo_path::RepoPath;
47use crate::revset::ResolvedRevsetExpression;
48use crate::revset::RevsetEvaluationError;
49use crate::revset::RevsetExpression;
50use crate::revset::RevsetFilterPredicate;
51use crate::store::Store;
52
53/// Annotation results for a specific file
54#[derive(Clone, Debug)]
55pub struct FileAnnotation {
56    line_map: OriginalLineMap,
57    text: BString,
58}
59
60impl FileAnnotation {
61    /// Returns iterator over `(commit_id, line)`s.
62    ///
63    /// For each line, `Ok(commit_id)` points to the originator commit of the
64    /// line. If no originator commit was found within the domain,
65    /// `Err(commit_id)` should be set. It points to the root (or boundary)
66    /// commit where the line exists.
67    ///
68    /// The `line` includes newline character.
69    pub fn lines(&self) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, &BStr)> {
70        itertools::zip_eq(&self.line_map, self.text.split_inclusive(|b| *b == b'\n'))
71            .map(|(commit_id, line)| (commit_id.as_ref(), line.as_ref()))
72    }
73
74    /// Returns iterator over `(commit_id, line_range)`s.
75    ///
76    /// See [`Self::lines()`] for `commit_id`s.
77    ///
78    /// The `line_range` is a slice range in the file `text`. Consecutive ranges
79    /// having the same `commit_id` are not compacted.
80    pub fn line_ranges(
81        &self,
82    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
83        let ranges = self
84            .text
85            .split_inclusive(|b| *b == b'\n')
86            .scan(0, |total, line| {
87                let start = *total;
88                *total += line.len();
89                Some(start..*total)
90            });
91        itertools::zip_eq(&self.line_map, ranges)
92            .map(|(commit_id, range)| (commit_id.as_ref(), range))
93    }
94
95    /// Returns iterator over compacted `(commit_id, line_range)`s.
96    ///
97    /// Consecutive ranges having the same `commit_id` are merged into one.
98    pub fn compact_line_ranges(
99        &self,
100    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
101        let mut ranges = self.line_ranges();
102        let mut acc = ranges.next();
103        iter::from_fn(move || {
104            let (acc_commit_id, acc_range) = acc.as_mut()?;
105            for (cur_commit_id, cur_range) in ranges.by_ref() {
106                if *acc_commit_id == cur_commit_id {
107                    acc_range.end = cur_range.end;
108                } else {
109                    return acc.replace((cur_commit_id, cur_range));
110                }
111            }
112            acc.take()
113        })
114    }
115
116    /// File content at the starting commit.
117    pub fn text(&self) -> &BStr {
118        self.text.as_ref()
119    }
120}
121
122/// A map from commits to file line mappings and contents.
123type CommitSourceMap = HashMap<CommitId, Source>;
124
125/// Line mapping and file content at a certain commit.
126#[derive(Clone, Debug)]
127struct Source {
128    /// Mapping of line numbers in the file at the current commit to the
129    /// original file, sorted by the line numbers at the current commit.
130    line_map: Vec<(usize, usize)>,
131    /// File content at the current commit.
132    text: BString,
133}
134
135impl Source {
136    fn new(text: BString) -> Self {
137        Source {
138            line_map: Vec::new(),
139            text,
140        }
141    }
142
143    fn load(commit: &Commit, file_path: &RepoPath) -> Result<Self, BackendError> {
144        let tree = commit.tree()?;
145        let text = get_file_contents(commit.store(), file_path, &tree)?;
146        Ok(Self::new(text))
147    }
148
149    fn fill_line_map(&mut self) {
150        let lines = self.text.split_inclusive(|b| *b == b'\n');
151        self.line_map = lines.enumerate().map(|(i, _)| (i, i)).collect();
152    }
153}
154
155/// List of commit IDs that originated lines, indexed by line numbers in the
156/// original file.
157type OriginalLineMap = Vec<Result<CommitId, CommitId>>;
158
159/// Get line by line annotations for a specific file path in the repo.
160///
161/// The `domain` expression narrows the range of ancestors to search. It will be
162/// intersected as `domain & ::starting_commit & files(file_path)`. The
163/// `starting_commit` is assumed to be included in the `domain`.
164///
165/// If the file is not found, returns empty results.
166pub fn get_annotation_for_file(
167    repo: &dyn Repo,
168    starting_commit: &Commit,
169    domain: &Rc<ResolvedRevsetExpression>,
170    file_path: &RepoPath,
171) -> Result<FileAnnotation, RevsetEvaluationError> {
172    let source = Source::load(starting_commit, file_path)?;
173    compute_file_annotation(repo, starting_commit.id(), domain, file_path, source)
174}
175
176/// Get line by line annotations for a specific file path starting with the
177/// given content.
178///
179/// The file content at the `starting_commit` is set to `starting_text`. This is
180/// typically one of the file contents in the conflict or merged-parent tree.
181///
182/// See [`get_annotation_for_file()`] for the other arguments.
183pub fn get_annotation_with_file_content(
184    repo: &dyn Repo,
185    starting_commit_id: &CommitId,
186    domain: &Rc<ResolvedRevsetExpression>,
187    file_path: &RepoPath,
188    starting_text: impl Into<Vec<u8>>,
189) -> Result<FileAnnotation, RevsetEvaluationError> {
190    let source = Source::new(BString::new(starting_text.into()));
191    compute_file_annotation(repo, starting_commit_id, domain, file_path, source)
192}
193
194fn compute_file_annotation(
195    repo: &dyn Repo,
196    starting_commit_id: &CommitId,
197    domain: &Rc<ResolvedRevsetExpression>,
198    file_path: &RepoPath,
199    mut source: Source,
200) -> Result<FileAnnotation, RevsetEvaluationError> {
201    source.fill_line_map();
202    let text = source.text.clone();
203    let line_map = process_commits(repo, starting_commit_id, source, domain, file_path)?;
204    Ok(FileAnnotation { line_map, text })
205}
206
207/// Starting at the starting commit, compute changes at that commit relative to
208/// it's direct parents, updating the mappings as we go. We return the final
209/// original line map that represents where each line of the original came from.
210fn process_commits(
211    repo: &dyn Repo,
212    starting_commit_id: &CommitId,
213    starting_source: Source,
214    domain: &Rc<ResolvedRevsetExpression>,
215    file_name: &RepoPath,
216) -> Result<OriginalLineMap, RevsetEvaluationError> {
217    let predicate = RevsetFilterPredicate::File(FilesetExpression::file_path(file_name.to_owned()));
218    // TODO: If the domain isn't a contiguous range, changes masked out by it
219    // might not be caught by the closest ancestor revision. For example,
220    // domain=merges() would pick up almost nothing because merge revisions
221    // are usually empty. Perhaps, we want to query `files(file_path,
222    // within_sub_graph=domain)`, not `domain & files(file_path)`.
223    let ancestors = RevsetExpression::commit(starting_commit_id.clone()).ancestors();
224    let revset = RevsetExpression::commit(starting_commit_id.clone())
225        .union(&domain.intersection(&ancestors).filtered(predicate))
226        .evaluate(repo)?;
227
228    let mut original_line_map =
229        vec![Err(starting_commit_id.clone()); starting_source.line_map.len()];
230    let mut commit_source_map = HashMap::from([(starting_commit_id.clone(), starting_source)]);
231
232    for node in revset.iter_graph() {
233        let (commit_id, edge_list) = node?;
234        process_commit(
235            repo,
236            file_name,
237            &mut original_line_map,
238            &mut commit_source_map,
239            &commit_id,
240            &edge_list,
241        )?;
242        if commit_source_map.is_empty() {
243            // No more lines to propagate to ancestors.
244            break;
245        }
246    }
247    Ok(original_line_map)
248}
249
250/// For a given commit, for each parent, we compare the version in the parent
251/// tree with the current version, updating the mappings for any lines in
252/// common. If the parent doesn't have the file, we skip it.
253fn process_commit(
254    repo: &dyn Repo,
255    file_name: &RepoPath,
256    original_line_map: &mut OriginalLineMap,
257    commit_source_map: &mut CommitSourceMap,
258    current_commit_id: &CommitId,
259    edges: &[GraphEdge<CommitId>],
260) -> Result<(), BackendError> {
261    let Some(mut current_source) = commit_source_map.remove(current_commit_id) else {
262        return Ok(());
263    };
264
265    for parent_edge in edges {
266        let parent_commit_id = &parent_edge.target;
267        let parent_source = match commit_source_map.entry(parent_commit_id.clone()) {
268            hash_map::Entry::Occupied(entry) => entry.into_mut(),
269            hash_map::Entry::Vacant(entry) => {
270                let commit = repo.store().get_commit(entry.key())?;
271                entry.insert(Source::load(&commit, file_name)?)
272            }
273        };
274
275        // For two versions of the same file, for all the lines in common,
276        // overwrite the new mapping in the results for the new commit. Let's
277        // say I have a file in commit A and commit B. We know that according to
278        // local line_map, in commit A, line 3 corresponds to line 7 of the
279        // original file. Now, line 3 in Commit A corresponds to line 6 in
280        // commit B. Then, we update local line_map to say that "Commit B line 6
281        // goes to line 7 of the original file". We repeat this for all lines in
282        // common in the two commits.
283        let mut current_lines = current_source.line_map.iter().copied().peekable();
284        let mut new_current_line_map = Vec::new();
285        let mut new_parent_line_map = Vec::new();
286        copy_same_lines_with(
287            &current_source.text,
288            &parent_source.text,
289            |current_start, parent_start, count| {
290                new_current_line_map
291                    .extend(current_lines.peeking_take_while(|&(cur, _)| cur < current_start));
292                while let Some((current, original)) =
293                    current_lines.next_if(|&(cur, _)| cur < current_start + count)
294                {
295                    let parent = parent_start + (current - current_start);
296                    new_parent_line_map.push((parent, original));
297                }
298            },
299        );
300        new_current_line_map.extend(current_lines);
301        current_source.line_map = new_current_line_map;
302        parent_source.line_map = if parent_source.line_map.is_empty() {
303            new_parent_line_map
304        } else {
305            itertools::merge(parent_source.line_map.iter().copied(), new_parent_line_map).collect()
306        };
307        // If an omitted parent had the file, leave these lines unresolved. The
308        // origin of the unresolved lines is represented as Err(root_commit_id).
309        if parent_edge.edge_type == GraphEdgeType::Missing {
310            for (_, original_line_number) in parent_source.line_map.drain(..) {
311                original_line_map[original_line_number] = Err(current_commit_id.clone());
312            }
313        }
314        if parent_source.line_map.is_empty() {
315            commit_source_map.remove(parent_commit_id);
316        }
317    }
318
319    // Once we've looked at all parents of a commit, any leftover lines must be
320    // original to the current commit, so we save this information in
321    // original_line_map.
322    for (_, original_line_number) in current_source.line_map {
323        original_line_map[original_line_number] = Ok(current_commit_id.clone());
324    }
325
326    Ok(())
327}
328
329/// For two files, calls `copy(current_start, parent_start, count)` for each
330/// range of contiguous lines in common (e.g. line 8-10 maps to line 9-11.)
331fn copy_same_lines_with(
332    current_contents: &[u8],
333    parent_contents: &[u8],
334    mut copy: impl FnMut(usize, usize, usize),
335) {
336    let diff = Diff::by_line([current_contents, parent_contents]);
337    let mut current_line_counter: usize = 0;
338    let mut parent_line_counter: usize = 0;
339    for hunk in diff.hunks() {
340        match hunk.kind {
341            DiffHunkKind::Matching => {
342                let count = hunk.contents[0].split_inclusive(|b| *b == b'\n').count();
343                copy(current_line_counter, parent_line_counter, count);
344                current_line_counter += count;
345                parent_line_counter += count;
346            }
347            DiffHunkKind::Different => {
348                let current_output = hunk.contents[0];
349                let parent_output = hunk.contents[1];
350                current_line_counter += current_output.split_inclusive(|b| *b == b'\n').count();
351                parent_line_counter += parent_output.split_inclusive(|b| *b == b'\n').count();
352            }
353        }
354    }
355}
356
357fn get_file_contents(
358    store: &Store,
359    path: &RepoPath,
360    tree: &MergedTree,
361) -> Result<BString, BackendError> {
362    let file_value = tree.path_value(path)?;
363    let effective_file_value = materialize_tree_value(store, path, file_value).block_on()?;
364    match effective_file_value {
365        MaterializedTreeValue::File(mut file) => Ok(file.read_all(path)?.into()),
366        MaterializedTreeValue::FileConflict { contents, .. } => Ok(
367            materialize_merge_result_to_bytes(&contents, ConflictMarkerStyle::default()),
368        ),
369        _ => Ok(BString::default()),
370    }
371}
372
373#[cfg(test)]
374mod tests {
375    use super::*;
376
377    #[test]
378    fn test_lines_iterator_empty() {
379        let annotation = FileAnnotation {
380            line_map: vec![],
381            text: "".into(),
382        };
383        assert_eq!(annotation.lines().collect_vec(), vec![]);
384        assert_eq!(annotation.line_ranges().collect_vec(), vec![]);
385        assert_eq!(annotation.compact_line_ranges().collect_vec(), vec![]);
386    }
387
388    #[test]
389    fn test_lines_iterator_with_content() {
390        let commit_id1 = CommitId::from_hex("111111");
391        let commit_id2 = CommitId::from_hex("222222");
392        let commit_id3 = CommitId::from_hex("333333");
393        let annotation = FileAnnotation {
394            line_map: vec![
395                Ok(commit_id1.clone()),
396                Ok(commit_id2.clone()),
397                Ok(commit_id3.clone()),
398            ],
399            text: "foo\n\nbar\n".into(),
400        };
401        assert_eq!(
402            annotation.lines().collect_vec(),
403            vec![
404                (Ok(&commit_id1), "foo\n".as_ref()),
405                (Ok(&commit_id2), "\n".as_ref()),
406                (Ok(&commit_id3), "bar\n".as_ref()),
407            ]
408        );
409        assert_eq!(
410            annotation.line_ranges().collect_vec(),
411            vec![
412                (Ok(&commit_id1), 0..4),
413                (Ok(&commit_id2), 4..5),
414                (Ok(&commit_id3), 5..9),
415            ]
416        );
417        assert_eq!(
418            annotation.compact_line_ranges().collect_vec(),
419            vec![
420                (Ok(&commit_id1), 0..4),
421                (Ok(&commit_id2), 4..5),
422                (Ok(&commit_id3), 5..9),
423            ]
424        );
425    }
426
427    #[test]
428    fn test_lines_iterator_compaction() {
429        let commit_id1 = CommitId::from_hex("111111");
430        let commit_id2 = CommitId::from_hex("222222");
431        let commit_id3 = CommitId::from_hex("333333");
432        let annotation = FileAnnotation {
433            line_map: vec![
434                Ok(commit_id1.clone()),
435                Ok(commit_id1.clone()),
436                Ok(commit_id2.clone()),
437                Ok(commit_id1.clone()),
438                Ok(commit_id3.clone()),
439                Ok(commit_id3.clone()),
440                Ok(commit_id3.clone()),
441            ],
442            text: "\n".repeat(7).into(),
443        };
444        assert_eq!(
445            annotation.compact_line_ranges().collect_vec(),
446            vec![
447                (Ok(&commit_id1), 0..2),
448                (Ok(&commit_id2), 2..3),
449                (Ok(&commit_id1), 3..4),
450                (Ok(&commit_id3), 4..7),
451            ]
452        );
453    }
454}