jj_lib/
annotate.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Methods that allow annotation (attribution and blame) for a file in a
16//! repository.
17//!
18//! TODO: Add support for different blame layers with a trait in the future.
19//! Like commit metadata and more.
20
21use std::collections::hash_map;
22use std::collections::HashMap;
23use std::iter;
24use std::ops::Range;
25use std::rc::Rc;
26
27use bstr::BStr;
28use bstr::BString;
29use itertools::Itertools as _;
30use pollster::FutureExt;
31
32use crate::backend::BackendError;
33use crate::backend::CommitId;
34use crate::commit::Commit;
35use crate::conflicts::materialize_merge_result_to_bytes;
36use crate::conflicts::materialize_tree_value;
37use crate::conflicts::ConflictMarkerStyle;
38use crate::conflicts::MaterializedTreeValue;
39use crate::diff::Diff;
40use crate::diff::DiffHunkKind;
41use crate::fileset::FilesetExpression;
42use crate::graph::GraphEdge;
43use crate::graph::GraphEdgeType;
44use crate::merged_tree::MergedTree;
45use crate::repo::Repo;
46use crate::repo_path::RepoPath;
47use crate::revset::ResolvedRevsetExpression;
48use crate::revset::RevsetEvaluationError;
49use crate::revset::RevsetExpression;
50use crate::revset::RevsetFilterPredicate;
51use crate::store::Store;
52
53/// Annotation results for a specific file
54#[derive(Clone, Debug)]
55pub struct FileAnnotation {
56    line_map: OriginalLineMap,
57    text: BString,
58}
59
60impl FileAnnotation {
61    /// Returns iterator over `(commit_id, line)`s.
62    ///
63    /// For each line, the `commit_id` points to the originator commit of the
64    /// line. The `line` includes newline character.
65    pub fn lines(&self) -> impl Iterator<Item = (Option<&CommitId>, &BStr)> {
66        itertools::zip_eq(&self.line_map, self.text.split_inclusive(|b| *b == b'\n'))
67            .map(|(commit_id, line)| (commit_id.as_ref(), line.as_ref()))
68    }
69
70    /// Returns iterator over `(commit_id, line_range)`s.
71    ///
72    /// For each line, the `commit_id` points to the originator commit of the
73    /// line. The `line_range` is a slice range in the file `text`. Consecutive
74    /// ranges having the same `commit_id` are not compacted.
75    pub fn line_ranges(&self) -> impl Iterator<Item = (Option<&CommitId>, Range<usize>)> {
76        let ranges = self
77            .text
78            .split_inclusive(|b| *b == b'\n')
79            .scan(0, |total, line| {
80                let start = *total;
81                *total += line.len();
82                Some(start..*total)
83            });
84        itertools::zip_eq(&self.line_map, ranges)
85            .map(|(commit_id, range)| (commit_id.as_ref(), range))
86    }
87
88    /// Returns iterator over compacted `(commit_id, line_range)`s.
89    ///
90    /// Consecutive ranges having the same `commit_id` are merged into one.
91    pub fn compact_line_ranges(&self) -> impl Iterator<Item = (Option<&CommitId>, Range<usize>)> {
92        let mut ranges = self.line_ranges();
93        let mut acc = ranges.next();
94        iter::from_fn(move || {
95            let (acc_commit_id, acc_range) = acc.as_mut()?;
96            for (cur_commit_id, cur_range) in ranges.by_ref() {
97                if *acc_commit_id == cur_commit_id {
98                    acc_range.end = cur_range.end;
99                } else {
100                    return acc.replace((cur_commit_id, cur_range));
101                }
102            }
103            acc.take()
104        })
105    }
106
107    /// File content at the starting commit.
108    pub fn text(&self) -> &BStr {
109        self.text.as_ref()
110    }
111}
112
113/// A map from commits to file line mappings and contents.
114type CommitSourceMap = HashMap<CommitId, Source>;
115
116/// Line mapping and file content at a certain commit.
117#[derive(Clone, Debug)]
118struct Source {
119    /// Mapping of line numbers in the file at the current commit to the
120    /// original file, sorted by the line numbers at the current commit.
121    line_map: Vec<(usize, usize)>,
122    /// File content at the current commit.
123    text: BString,
124}
125
126impl Source {
127    fn new(text: BString) -> Self {
128        Source {
129            line_map: Vec::new(),
130            text,
131        }
132    }
133
134    fn load(commit: &Commit, file_path: &RepoPath) -> Result<Self, BackendError> {
135        let tree = commit.tree()?;
136        let text = get_file_contents(commit.store(), file_path, &tree)?;
137        Ok(Self::new(text))
138    }
139
140    fn fill_line_map(&mut self) {
141        let lines = self.text.split_inclusive(|b| *b == b'\n');
142        self.line_map = lines.enumerate().map(|(i, _)| (i, i)).collect();
143    }
144}
145
146/// List of commit IDs that originated lines, indexed by line numbers in the
147/// original file.
148type OriginalLineMap = Vec<Option<CommitId>>;
149
150/// Get line by line annotations for a specific file path in the repo.
151///
152/// The `domain` expression narrows the range of ancestors to search. It will be
153/// intersected as `domain & ::starting_commit & files(file_path)`. The
154/// `starting_commit` is assumed to be included in the `domain`.
155///
156/// If the file is not found, returns empty results.
157pub fn get_annotation_for_file(
158    repo: &dyn Repo,
159    starting_commit: &Commit,
160    domain: &Rc<ResolvedRevsetExpression>,
161    file_path: &RepoPath,
162) -> Result<FileAnnotation, RevsetEvaluationError> {
163    let source = Source::load(starting_commit, file_path)?;
164    compute_file_annotation(repo, starting_commit.id(), domain, file_path, source)
165}
166
167/// Get line by line annotations for a specific file path starting with the
168/// given content.
169///
170/// The file content at the `starting_commit` is set to `starting_text`. This is
171/// typically one of the file contents in the conflict or merged-parent tree.
172///
173/// See [`get_annotation_for_file()`] for the other arguments.
174pub fn get_annotation_with_file_content(
175    repo: &dyn Repo,
176    starting_commit_id: &CommitId,
177    domain: &Rc<ResolvedRevsetExpression>,
178    file_path: &RepoPath,
179    starting_text: impl Into<Vec<u8>>,
180) -> Result<FileAnnotation, RevsetEvaluationError> {
181    let source = Source::new(BString::new(starting_text.into()));
182    compute_file_annotation(repo, starting_commit_id, domain, file_path, source)
183}
184
185fn compute_file_annotation(
186    repo: &dyn Repo,
187    starting_commit_id: &CommitId,
188    domain: &Rc<ResolvedRevsetExpression>,
189    file_path: &RepoPath,
190    mut source: Source,
191) -> Result<FileAnnotation, RevsetEvaluationError> {
192    source.fill_line_map();
193    let text = source.text.clone();
194    let line_map = process_commits(repo, starting_commit_id, source, domain, file_path)?;
195    Ok(FileAnnotation { line_map, text })
196}
197
198/// Starting at the starting commit, compute changes at that commit relative to
199/// it's direct parents, updating the mappings as we go. We return the final
200/// original line map that represents where each line of the original came from.
201fn process_commits(
202    repo: &dyn Repo,
203    starting_commit_id: &CommitId,
204    starting_source: Source,
205    domain: &Rc<ResolvedRevsetExpression>,
206    file_name: &RepoPath,
207) -> Result<OriginalLineMap, RevsetEvaluationError> {
208    let predicate = RevsetFilterPredicate::File(FilesetExpression::file_path(file_name.to_owned()));
209    // TODO: If the domain isn't a contiguous range, changes masked out by it
210    // might not be caught by the closest ancestor revision. For example,
211    // domain=merges() would pick up almost nothing because merge revisions
212    // are usually empty. Perhaps, we want to query `files(file_path,
213    // within_sub_graph=domain)`, not `domain & files(file_path)`.
214    let ancestors = RevsetExpression::commit(starting_commit_id.clone()).ancestors();
215    let revset = RevsetExpression::commit(starting_commit_id.clone())
216        .union(&domain.intersection(&ancestors).filtered(predicate))
217        .evaluate(repo)?;
218
219    let mut original_line_map = vec![None; starting_source.line_map.len()];
220    let mut commit_source_map = HashMap::from([(starting_commit_id.clone(), starting_source)]);
221
222    for node in revset.iter_graph() {
223        let (commit_id, edge_list) = node?;
224        process_commit(
225            repo,
226            file_name,
227            &mut original_line_map,
228            &mut commit_source_map,
229            &commit_id,
230            &edge_list,
231        )?;
232        if commit_source_map.is_empty() {
233            // No more lines to propagate to ancestors.
234            break;
235        }
236    }
237    Ok(original_line_map)
238}
239
240/// For a given commit, for each parent, we compare the version in the parent
241/// tree with the current version, updating the mappings for any lines in
242/// common. If the parent doesn't have the file, we skip it.
243fn process_commit(
244    repo: &dyn Repo,
245    file_name: &RepoPath,
246    original_line_map: &mut OriginalLineMap,
247    commit_source_map: &mut CommitSourceMap,
248    current_commit_id: &CommitId,
249    edges: &[GraphEdge<CommitId>],
250) -> Result<(), BackendError> {
251    let Some(mut current_source) = commit_source_map.remove(current_commit_id) else {
252        return Ok(());
253    };
254
255    for parent_edge in edges {
256        let parent_commit_id = &parent_edge.target;
257        let parent_source = match commit_source_map.entry(parent_commit_id.clone()) {
258            hash_map::Entry::Occupied(entry) => entry.into_mut(),
259            hash_map::Entry::Vacant(entry) => {
260                let commit = repo.store().get_commit(entry.key())?;
261                entry.insert(Source::load(&commit, file_name)?)
262            }
263        };
264
265        // For two versions of the same file, for all the lines in common,
266        // overwrite the new mapping in the results for the new commit. Let's
267        // say I have a file in commit A and commit B. We know that according to
268        // local line_map, in commit A, line 3 corresponds to line 7 of the
269        // original file. Now, line 3 in Commit A corresponds to line 6 in
270        // commit B. Then, we update local line_map to say that "Commit B line 6
271        // goes to line 7 of the original file". We repeat this for all lines in
272        // common in the two commits.
273        let mut current_lines = current_source.line_map.iter().copied().peekable();
274        let mut new_current_line_map = Vec::new();
275        let mut new_parent_line_map = Vec::new();
276        copy_same_lines_with(
277            &current_source.text,
278            &parent_source.text,
279            |current_start, parent_start, count| {
280                new_current_line_map
281                    .extend(current_lines.peeking_take_while(|&(cur, _)| cur < current_start));
282                while let Some((current, original)) =
283                    current_lines.next_if(|&(cur, _)| cur < current_start + count)
284                {
285                    let parent = parent_start + (current - current_start);
286                    new_parent_line_map.push((parent, original));
287                }
288            },
289        );
290        new_current_line_map.extend(current_lines);
291        current_source.line_map = new_current_line_map;
292        parent_source.line_map = if parent_source.line_map.is_empty() {
293            new_parent_line_map
294        } else {
295            itertools::merge(parent_source.line_map.iter().copied(), new_parent_line_map).collect()
296        };
297        // If an omitted parent had the file, leave these lines unresolved.
298        // TODO: These unresolved lines could be copied to the original_line_map
299        // as Err(commit_id) or something instead of None.
300        if parent_source.line_map.is_empty() || parent_edge.edge_type == GraphEdgeType::Missing {
301            commit_source_map.remove(parent_commit_id);
302        }
303    }
304
305    // Once we've looked at all parents of a commit, any leftover lines must be
306    // original to the current commit, so we save this information in
307    // original_line_map.
308    for (_, original_line_number) in current_source.line_map {
309        original_line_map[original_line_number] = Some(current_commit_id.clone());
310    }
311
312    Ok(())
313}
314
315/// For two files, calls `copy(current_start, parent_start, count)` for each
316/// range of contiguous lines in common (e.g. line 8-10 maps to line 9-11.)
317fn copy_same_lines_with(
318    current_contents: &[u8],
319    parent_contents: &[u8],
320    mut copy: impl FnMut(usize, usize, usize),
321) {
322    let diff = Diff::by_line([current_contents, parent_contents]);
323    let mut current_line_counter: usize = 0;
324    let mut parent_line_counter: usize = 0;
325    for hunk in diff.hunks() {
326        match hunk.kind {
327            DiffHunkKind::Matching => {
328                let count = hunk.contents[0].split_inclusive(|b| *b == b'\n').count();
329                copy(current_line_counter, parent_line_counter, count);
330                current_line_counter += count;
331                parent_line_counter += count;
332            }
333            DiffHunkKind::Different => {
334                let current_output = hunk.contents[0];
335                let parent_output = hunk.contents[1];
336                current_line_counter += current_output.split_inclusive(|b| *b == b'\n').count();
337                parent_line_counter += parent_output.split_inclusive(|b| *b == b'\n').count();
338            }
339        }
340    }
341}
342
343fn get_file_contents(
344    store: &Store,
345    path: &RepoPath,
346    tree: &MergedTree,
347) -> Result<BString, BackendError> {
348    let file_value = tree.path_value(path)?;
349    let effective_file_value = materialize_tree_value(store, path, file_value).block_on()?;
350    match effective_file_value {
351        MaterializedTreeValue::File { mut reader, id, .. } => {
352            let mut file_contents = Vec::new();
353            reader
354                .read_to_end(&mut file_contents)
355                .map_err(|e| BackendError::ReadFile {
356                    path: path.to_owned(),
357                    id,
358                    source: Box::new(e),
359                })?;
360            Ok(file_contents.into())
361        }
362        MaterializedTreeValue::FileConflict { contents, .. } => Ok(
363            materialize_merge_result_to_bytes(&contents, ConflictMarkerStyle::default()),
364        ),
365        _ => Ok(BString::default()),
366    }
367}
368
369#[cfg(test)]
370mod tests {
371    use super::*;
372
373    #[test]
374    fn test_lines_iterator_empty() {
375        let annotation = FileAnnotation {
376            line_map: vec![],
377            text: "".into(),
378        };
379        assert_eq!(annotation.lines().collect_vec(), vec![]);
380        assert_eq!(annotation.line_ranges().collect_vec(), vec![]);
381        assert_eq!(annotation.compact_line_ranges().collect_vec(), vec![]);
382    }
383
384    #[test]
385    fn test_lines_iterator_with_content() {
386        let commit_id1 = CommitId::from_hex("111111");
387        let commit_id2 = CommitId::from_hex("222222");
388        let commit_id3 = CommitId::from_hex("333333");
389        let annotation = FileAnnotation {
390            line_map: vec![
391                Some(commit_id1.clone()),
392                Some(commit_id2.clone()),
393                Some(commit_id3.clone()),
394            ],
395            text: "foo\n\nbar\n".into(),
396        };
397        assert_eq!(
398            annotation.lines().collect_vec(),
399            vec![
400                (Some(&commit_id1), "foo\n".as_ref()),
401                (Some(&commit_id2), "\n".as_ref()),
402                (Some(&commit_id3), "bar\n".as_ref()),
403            ]
404        );
405        assert_eq!(
406            annotation.line_ranges().collect_vec(),
407            vec![
408                (Some(&commit_id1), 0..4),
409                (Some(&commit_id2), 4..5),
410                (Some(&commit_id3), 5..9),
411            ]
412        );
413        assert_eq!(
414            annotation.compact_line_ranges().collect_vec(),
415            vec![
416                (Some(&commit_id1), 0..4),
417                (Some(&commit_id2), 4..5),
418                (Some(&commit_id3), 5..9),
419            ]
420        );
421    }
422
423    #[test]
424    fn test_lines_iterator_compaction() {
425        let commit_id1 = CommitId::from_hex("111111");
426        let commit_id2 = CommitId::from_hex("222222");
427        let commit_id3 = CommitId::from_hex("333333");
428        let annotation = FileAnnotation {
429            line_map: vec![
430                Some(commit_id1.clone()),
431                Some(commit_id1.clone()),
432                Some(commit_id2.clone()),
433                Some(commit_id1.clone()),
434                Some(commit_id3.clone()),
435                Some(commit_id3.clone()),
436                Some(commit_id3.clone()),
437            ],
438            text: "\n".repeat(7).into(),
439        };
440        assert_eq!(
441            annotation.compact_line_ranges().collect_vec(),
442            vec![
443                (Some(&commit_id1), 0..2),
444                (Some(&commit_id2), 2..3),
445                (Some(&commit_id1), 3..4),
446                (Some(&commit_id3), 4..7),
447            ]
448        );
449    }
450}