jj_lib/
annotate.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Methods that allow annotation (attribution and blame) for a file in a
16//! repository.
17//!
18//! TODO: Add support for different blame layers with a trait in the future.
19//! Like commit metadata and more.
20
21use std::collections::HashMap;
22use std::collections::hash_map;
23use std::iter;
24use std::ops::Range;
25use std::rc::Rc;
26
27use bstr::BStr;
28use bstr::BString;
29use itertools::Itertools as _;
30use pollster::FutureExt as _;
31
32use crate::backend::BackendError;
33use crate::backend::BackendResult;
34use crate::backend::CommitId;
35use crate::commit::Commit;
36use crate::conflicts::ConflictMarkerStyle;
37use crate::conflicts::MaterializedTreeValue;
38use crate::conflicts::materialize_merge_result_to_bytes;
39use crate::conflicts::materialize_tree_value;
40use crate::diff::Diff;
41use crate::diff::DiffHunkKind;
42use crate::fileset::FilesetExpression;
43use crate::graph::GraphEdge;
44use crate::merged_tree::MergedTree;
45use crate::repo::Repo;
46use crate::repo_path::RepoPath;
47use crate::repo_path::RepoPathBuf;
48use crate::revset::ResolvedRevsetExpression;
49use crate::revset::RevsetEvaluationError;
50use crate::revset::RevsetExpression;
51use crate::revset::RevsetFilterPredicate;
52use crate::store::Store;
53
54/// Annotation results for a specific file
55#[derive(Clone, Debug)]
56pub struct FileAnnotation {
57    line_map: OriginalLineMap,
58    text: BString,
59}
60
61impl FileAnnotation {
62    /// Returns iterator over `(line_origin, line)`s.
63    ///
64    /// For each line, `Ok(line_origin)` returns information about the
65    /// originator commit of the line. If no originator commit was found
66    /// within the domain, `Err(line_origin)` should be set. It points to the
67    /// commit outside of the domain where the search stopped.
68    ///
69    /// The `line` includes newline character.
70    pub fn line_origins(&self) -> impl Iterator<Item = (Result<&LineOrigin, &LineOrigin>, &BStr)> {
71        itertools::zip_eq(&self.line_map, self.text.split_inclusive(|b| *b == b'\n'))
72            .map(|(line_origin, line)| (line_origin.as_ref(), line.as_ref()))
73    }
74
75    /// Returns iterator over `(commit_id, line)`s.
76    ///
77    /// For each line, `Ok(commit_id)` points to the originator commit of the
78    /// line. If no originator commit was found within the domain,
79    /// `Err(commit_id)` should be set. It points to the commit outside of the
80    /// domain where the search stopped.
81    ///
82    /// The `line` includes newline character.
83    pub fn lines(&self) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, &BStr)> {
84        itertools::zip_eq(
85            self.commit_ids(),
86            self.text
87                .split_inclusive(|b| *b == b'\n')
88                .map(AsRef::as_ref),
89        )
90    }
91
92    /// Returns iterator over `(commit_id, line_range)`s.
93    ///
94    /// See [`Self::lines()`] for `commit_id`s.
95    ///
96    /// The `line_range` is a slice range in the file `text`. Consecutive ranges
97    /// having the same `commit_id` are not compacted.
98    pub fn line_ranges(
99        &self,
100    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
101        let ranges = self
102            .text
103            .split_inclusive(|b| *b == b'\n')
104            .scan(0, |total, line| {
105                let start = *total;
106                *total += line.len();
107                Some(start..*total)
108            });
109        itertools::zip_eq(self.commit_ids(), ranges)
110    }
111
112    /// Returns iterator over compacted `(commit_id, line_range)`s.
113    ///
114    /// Consecutive ranges having the same `commit_id` are merged into one.
115    pub fn compact_line_ranges(
116        &self,
117    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
118        let mut ranges = self.line_ranges();
119        let mut acc = ranges.next();
120        iter::from_fn(move || {
121            let (acc_commit_id, acc_range) = acc.as_mut()?;
122            for (cur_commit_id, cur_range) in ranges.by_ref() {
123                if *acc_commit_id == cur_commit_id {
124                    acc_range.end = cur_range.end;
125                } else {
126                    return acc.replace((cur_commit_id, cur_range));
127                }
128            }
129            acc.take()
130        })
131    }
132
133    /// File content at the starting commit.
134    pub fn text(&self) -> &BStr {
135        self.text.as_ref()
136    }
137
138    fn commit_ids(&self) -> impl Iterator<Item = Result<&CommitId, &CommitId>> {
139        self.line_map.iter().map(|line_origin| {
140            line_origin
141                .as_ref()
142                .map(|origin| &origin.commit_id)
143                .map_err(|origin| &origin.commit_id)
144        })
145    }
146}
147
148/// Annotation process for a specific file.
149#[derive(Clone, Debug)]
150pub struct FileAnnotator {
151    // If we add copy-tracing support, file_path might be tracked by state.
152    file_path: RepoPathBuf,
153    starting_text: BString,
154    state: AnnotationState,
155}
156
157impl FileAnnotator {
158    /// Initializes annotator for a specific file in the `starting_commit`.
159    ///
160    /// If the file is not found, the result would be empty.
161    pub fn from_commit(starting_commit: &Commit, file_path: &RepoPath) -> BackendResult<Self> {
162        let source = Source::load(starting_commit, file_path)?;
163        Ok(Self::with_source(starting_commit.id(), file_path, source))
164    }
165
166    /// Initializes annotator for a specific file path starting with the given
167    /// content.
168    ///
169    /// The file content at the `starting_commit` is set to `starting_text`.
170    /// This is typically one of the file contents in the conflict or
171    /// merged-parent tree.
172    pub fn with_file_content(
173        starting_commit_id: &CommitId,
174        file_path: &RepoPath,
175        starting_text: impl Into<Vec<u8>>,
176    ) -> Self {
177        let source = Source::new(BString::new(starting_text.into()));
178        Self::with_source(starting_commit_id, file_path, source)
179    }
180
181    fn with_source(
182        starting_commit_id: &CommitId,
183        file_path: &RepoPath,
184        mut source: Source,
185    ) -> Self {
186        source.fill_line_map();
187        let starting_text = source.text.clone();
188        let state = AnnotationState {
189            original_line_map: (0..source.line_map.len())
190                .map(|line_number| {
191                    Err(LineOrigin {
192                        commit_id: starting_commit_id.clone(),
193                        line_number,
194                    })
195                })
196                .collect(),
197            commit_source_map: HashMap::from([(starting_commit_id.clone(), source)]),
198            num_unresolved_roots: 0,
199        };
200        Self {
201            file_path: file_path.to_owned(),
202            starting_text,
203            state,
204        }
205    }
206
207    /// Computes line-by-line annotation within the `domain`.
208    ///
209    /// The `domain` expression narrows the range of ancestors to search. It
210    /// will be intersected as `domain & ::pending_commits & files(file_path)`.
211    /// The `pending_commits` is assumed to be included in the `domain`.
212    pub fn compute(
213        &mut self,
214        repo: &dyn Repo,
215        domain: &Rc<ResolvedRevsetExpression>,
216    ) -> Result<(), RevsetEvaluationError> {
217        process_commits(repo, &mut self.state, domain, &self.file_path)
218    }
219
220    /// Remaining commit ids to visit from.
221    pub fn pending_commits(&self) -> impl Iterator<Item = &CommitId> {
222        self.state.commit_source_map.keys()
223    }
224
225    /// Returns the current state as line-oriented annotation.
226    pub fn to_annotation(&self) -> FileAnnotation {
227        // Just clone the line map. We might want to change the underlying data
228        // model something akin to interleaved delta in order to get annotation
229        // at a certain ancestor commit without recomputing.
230        FileAnnotation {
231            line_map: self.state.original_line_map.clone(),
232            text: self.starting_text.clone(),
233        }
234    }
235}
236
237/// Intermediate state of file annotation.
238#[derive(Clone, Debug)]
239struct AnnotationState {
240    original_line_map: OriginalLineMap,
241    /// Commits to file line mappings and contents.
242    commit_source_map: HashMap<CommitId, Source>,
243    /// Number of unresolved root commits in `commit_source_map`.
244    num_unresolved_roots: usize,
245}
246
247/// Line mapping and file content at a certain commit.
248#[derive(Clone, Debug)]
249struct Source {
250    /// Mapping of line numbers in the file at the current commit to the
251    /// starting file, sorted by the line numbers at the current commit.
252    line_map: Vec<(usize, usize)>,
253    /// File content at the current commit.
254    text: BString,
255}
256
257impl Source {
258    fn new(text: BString) -> Self {
259        Self {
260            line_map: Vec::new(),
261            text,
262        }
263    }
264
265    fn load(commit: &Commit, file_path: &RepoPath) -> Result<Self, BackendError> {
266        let tree = commit.tree()?;
267        let text = get_file_contents(commit.store(), file_path, &tree).block_on()?;
268        Ok(Self::new(text))
269    }
270
271    fn fill_line_map(&mut self) {
272        let lines = self.text.split_inclusive(|b| *b == b'\n');
273        self.line_map = lines.enumerate().map(|(i, _)| (i, i)).collect();
274    }
275}
276
277/// List of origins for each line, indexed by line numbers in the
278/// starting file.
279type OriginalLineMap = Vec<Result<LineOrigin, LineOrigin>>;
280
281/// Information about the origin of an annotated line.
282#[derive(Clone, Debug, Eq, PartialEq)]
283pub struct LineOrigin {
284    /// Commit ID where the line was introduced.
285    pub commit_id: CommitId,
286    /// 0-based line number of the line in the origin commit.
287    pub line_number: usize,
288}
289
290/// Starting from the source commits, compute changes at that commit relative to
291/// its direct parents, updating the mappings as we go.
292fn process_commits(
293    repo: &dyn Repo,
294    state: &mut AnnotationState,
295    domain: &Rc<ResolvedRevsetExpression>,
296    file_name: &RepoPath,
297) -> Result<(), RevsetEvaluationError> {
298    let predicate = RevsetFilterPredicate::File(FilesetExpression::file_path(file_name.to_owned()));
299    // TODO: If the domain isn't a contiguous range, changes masked out by it
300    // might not be caught by the closest ancestor revision. For example,
301    // domain=merges() would pick up almost nothing because merge revisions
302    // are usually empty. Perhaps, we want to query `files(file_path,
303    // within_sub_graph=domain)`, not `domain & files(file_path)`.
304    let heads = RevsetExpression::commits(state.commit_source_map.keys().cloned().collect());
305    let revset = heads
306        .union(&domain.intersection(&heads.ancestors()).filtered(predicate))
307        .evaluate(repo)?;
308
309    state.num_unresolved_roots = 0;
310    for node in revset.iter_graph() {
311        let (commit_id, edge_list) = node?;
312        process_commit(repo, file_name, state, &commit_id, &edge_list)?;
313        if state.commit_source_map.len() == state.num_unresolved_roots {
314            // No more lines to propagate to ancestors.
315            break;
316        }
317    }
318    Ok(())
319}
320
321/// For a given commit, for each parent, we compare the version in the parent
322/// tree with the current version, updating the mappings for any lines in
323/// common. If the parent doesn't have the file, we skip it.
324fn process_commit(
325    repo: &dyn Repo,
326    file_name: &RepoPath,
327    state: &mut AnnotationState,
328    current_commit_id: &CommitId,
329    edges: &[GraphEdge<CommitId>],
330) -> Result<(), BackendError> {
331    let Some(mut current_source) = state.commit_source_map.remove(current_commit_id) else {
332        return Ok(());
333    };
334
335    for parent_edge in edges {
336        let parent_commit_id = &parent_edge.target;
337        let parent_source = match state.commit_source_map.entry(parent_commit_id.clone()) {
338            hash_map::Entry::Occupied(entry) => entry.into_mut(),
339            hash_map::Entry::Vacant(entry) => {
340                let commit = repo.store().get_commit(entry.key())?;
341                entry.insert(Source::load(&commit, file_name)?)
342            }
343        };
344
345        // For two versions of the same file, for all the lines in common,
346        // overwrite the new mapping in the results for the new commit. Let's
347        // say I have a file in commit A and commit B. We know that according to
348        // local line_map, in commit A, line 3 corresponds to line 7 of the
349        // starting file. Now, line 3 in Commit A corresponds to line 6 in
350        // commit B. Then, we update local line_map to say that "Commit B line 6
351        // goes to line 7 of the starting file". We repeat this for all lines in
352        // common in the two commits.
353        let mut current_lines = current_source.line_map.iter().copied().peekable();
354        let mut new_current_line_map = Vec::new();
355        let mut new_parent_line_map = Vec::new();
356        copy_same_lines_with(
357            &current_source.text,
358            &parent_source.text,
359            |current_start, parent_start, count| {
360                new_current_line_map
361                    .extend(current_lines.peeking_take_while(|&(cur, _)| cur < current_start));
362                while let Some((current, starting)) =
363                    current_lines.next_if(|&(cur, _)| cur < current_start + count)
364                {
365                    let parent = parent_start + (current - current_start);
366                    new_parent_line_map.push((parent, starting));
367                }
368            },
369        );
370        new_current_line_map.extend(current_lines);
371        current_source.line_map = new_current_line_map;
372        parent_source.line_map = if parent_source.line_map.is_empty() {
373            new_parent_line_map
374        } else {
375            itertools::merge(parent_source.line_map.iter().copied(), new_parent_line_map).collect()
376        };
377        if parent_source.line_map.is_empty() {
378            state.commit_source_map.remove(parent_commit_id);
379        } else if parent_edge.is_missing() {
380            // If an omitted parent had the file, leave these lines unresolved.
381            // The origin of the unresolved lines is represented as
382            // Err(LineOrigin { parent_commit_id, parent_line_number }).
383            for &(parent_line_number, starting_line_number) in &parent_source.line_map {
384                state.original_line_map[starting_line_number] = Err(LineOrigin {
385                    commit_id: parent_commit_id.clone(),
386                    line_number: parent_line_number,
387                });
388            }
389            state.num_unresolved_roots += 1;
390        }
391    }
392
393    // Once we've looked at all parents of a commit, any leftover lines must be
394    // original to the current commit, so we save this information in
395    // original_line_map.
396    for (current_line_number, starting_line_number) in current_source.line_map {
397        state.original_line_map[starting_line_number] = Ok(LineOrigin {
398            commit_id: current_commit_id.clone(),
399            line_number: current_line_number,
400        });
401    }
402
403    Ok(())
404}
405
406/// For two files, calls `copy(current_start, parent_start, count)` for each
407/// range of contiguous lines in common (e.g. line 8-10 maps to line 9-11.)
408fn copy_same_lines_with(
409    current_contents: &[u8],
410    parent_contents: &[u8],
411    mut copy: impl FnMut(usize, usize, usize),
412) {
413    let diff = Diff::by_line([current_contents, parent_contents]);
414    let mut current_line_counter: usize = 0;
415    let mut parent_line_counter: usize = 0;
416    for hunk in diff.hunks() {
417        match hunk.kind {
418            DiffHunkKind::Matching => {
419                let count = hunk.contents[0].split_inclusive(|b| *b == b'\n').count();
420                copy(current_line_counter, parent_line_counter, count);
421                current_line_counter += count;
422                parent_line_counter += count;
423            }
424            DiffHunkKind::Different => {
425                let current_output = hunk.contents[0];
426                let parent_output = hunk.contents[1];
427                current_line_counter += current_output.split_inclusive(|b| *b == b'\n').count();
428                parent_line_counter += parent_output.split_inclusive(|b| *b == b'\n').count();
429            }
430        }
431    }
432}
433
434async fn get_file_contents(
435    store: &Store,
436    path: &RepoPath,
437    tree: &MergedTree,
438) -> Result<BString, BackendError> {
439    let file_value = tree.path_value_async(path).await?;
440    let effective_file_value = materialize_tree_value(store, path, file_value).await?;
441    match effective_file_value {
442        MaterializedTreeValue::File(mut file) => Ok(file.read_all(path).await?.into()),
443        MaterializedTreeValue::FileConflict(file) => Ok(materialize_merge_result_to_bytes(
444            &file.contents,
445            ConflictMarkerStyle::default(),
446        )),
447        _ => Ok(BString::default()),
448    }
449}
450
451#[cfg(test)]
452mod tests {
453    use super::*;
454
455    fn make_line_origin(commit_id: &CommitId, line_number: usize) -> LineOrigin {
456        LineOrigin {
457            commit_id: commit_id.clone(),
458            line_number,
459        }
460    }
461
462    #[test]
463    fn test_lines_iterator_empty() {
464        let annotation = FileAnnotation {
465            line_map: vec![],
466            text: "".into(),
467        };
468        assert_eq!(annotation.line_origins().collect_vec(), vec![]);
469        assert_eq!(annotation.lines().collect_vec(), vec![]);
470        assert_eq!(annotation.line_ranges().collect_vec(), vec![]);
471        assert_eq!(annotation.compact_line_ranges().collect_vec(), vec![]);
472    }
473
474    #[test]
475    fn test_lines_iterator_with_content() {
476        let commit_id1 = CommitId::from_hex("111111");
477        let commit_id2 = CommitId::from_hex("222222");
478        let commit_id3 = CommitId::from_hex("333333");
479        let annotation = FileAnnotation {
480            line_map: vec![
481                Ok(make_line_origin(&commit_id1, 0)),
482                Ok(make_line_origin(&commit_id2, 1)),
483                Ok(make_line_origin(&commit_id3, 2)),
484            ],
485            text: "foo\n\nbar\n".into(),
486        };
487        assert_eq!(
488            annotation.line_origins().collect_vec(),
489            vec![
490                (Ok(&make_line_origin(&commit_id1, 0)), "foo\n".as_ref()),
491                (Ok(&make_line_origin(&commit_id2, 1)), "\n".as_ref()),
492                (Ok(&make_line_origin(&commit_id3, 2)), "bar\n".as_ref()),
493            ]
494        );
495        assert_eq!(
496            annotation.lines().collect_vec(),
497            vec![
498                (Ok(&commit_id1), "foo\n".as_ref()),
499                (Ok(&commit_id2), "\n".as_ref()),
500                (Ok(&commit_id3), "bar\n".as_ref()),
501            ]
502        );
503        assert_eq!(
504            annotation.line_ranges().collect_vec(),
505            vec![
506                (Ok(&commit_id1), 0..4),
507                (Ok(&commit_id2), 4..5),
508                (Ok(&commit_id3), 5..9),
509            ]
510        );
511        assert_eq!(
512            annotation.compact_line_ranges().collect_vec(),
513            vec![
514                (Ok(&commit_id1), 0..4),
515                (Ok(&commit_id2), 4..5),
516                (Ok(&commit_id3), 5..9),
517            ]
518        );
519    }
520
521    #[test]
522    fn test_lines_iterator_compaction() {
523        let commit_id1 = CommitId::from_hex("111111");
524        let commit_id2 = CommitId::from_hex("222222");
525        let commit_id3 = CommitId::from_hex("333333");
526        let annotation = FileAnnotation {
527            line_map: vec![
528                Ok(make_line_origin(&commit_id1, 0)),
529                Ok(make_line_origin(&commit_id1, 1)),
530                Ok(make_line_origin(&commit_id2, 2)),
531                Ok(make_line_origin(&commit_id1, 3)),
532                Ok(make_line_origin(&commit_id3, 4)),
533                Ok(make_line_origin(&commit_id3, 5)),
534                Ok(make_line_origin(&commit_id3, 6)),
535            ],
536            text: "\n".repeat(7).into(),
537        };
538        assert_eq!(
539            annotation.compact_line_ranges().collect_vec(),
540            vec![
541                (Ok(&commit_id1), 0..2),
542                (Ok(&commit_id2), 2..3),
543                (Ok(&commit_id1), 3..4),
544                (Ok(&commit_id3), 4..7),
545            ]
546        );
547    }
548}