jj_lib/
annotate.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Methods that allow annotation (attribution and blame) for a file in a
16//! repository.
17//!
18//! TODO: Add support for different blame layers with a trait in the future.
19//! Like commit metadata and more.
20
21use std::collections::hash_map;
22use std::collections::HashMap;
23use std::iter;
24use std::ops::Range;
25use std::rc::Rc;
26
27use bstr::BStr;
28use bstr::BString;
29use itertools::Itertools as _;
30use pollster::FutureExt as _;
31
32use crate::backend::BackendError;
33use crate::backend::BackendResult;
34use crate::backend::CommitId;
35use crate::commit::Commit;
36use crate::conflicts::materialize_merge_result_to_bytes;
37use crate::conflicts::materialize_tree_value;
38use crate::conflicts::ConflictMarkerStyle;
39use crate::conflicts::MaterializedTreeValue;
40use crate::diff::Diff;
41use crate::diff::DiffHunkKind;
42use crate::fileset::FilesetExpression;
43use crate::graph::GraphEdge;
44use crate::graph::GraphEdgeType;
45use crate::merged_tree::MergedTree;
46use crate::repo::Repo;
47use crate::repo_path::RepoPath;
48use crate::repo_path::RepoPathBuf;
49use crate::revset::ResolvedRevsetExpression;
50use crate::revset::RevsetEvaluationError;
51use crate::revset::RevsetExpression;
52use crate::revset::RevsetFilterPredicate;
53use crate::store::Store;
54
55/// Annotation results for a specific file
56#[derive(Clone, Debug)]
57pub struct FileAnnotation {
58    line_map: OriginalLineMap,
59    text: BString,
60}
61
62impl FileAnnotation {
63    /// Returns iterator over `(commit_id, line)`s.
64    ///
65    /// For each line, `Ok(commit_id)` points to the originator commit of the
66    /// line. If no originator commit was found within the domain,
67    /// `Err(commit_id)` should be set. It points to the root (or boundary)
68    /// commit where the line exists.
69    ///
70    /// The `line` includes newline character.
71    pub fn lines(&self) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, &BStr)> {
72        itertools::zip_eq(&self.line_map, self.text.split_inclusive(|b| *b == b'\n'))
73            .map(|(commit_id, line)| (commit_id.as_ref(), line.as_ref()))
74    }
75
76    /// Returns iterator over `(commit_id, line_range)`s.
77    ///
78    /// See [`Self::lines()`] for `commit_id`s.
79    ///
80    /// The `line_range` is a slice range in the file `text`. Consecutive ranges
81    /// having the same `commit_id` are not compacted.
82    pub fn line_ranges(
83        &self,
84    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
85        let ranges = self
86            .text
87            .split_inclusive(|b| *b == b'\n')
88            .scan(0, |total, line| {
89                let start = *total;
90                *total += line.len();
91                Some(start..*total)
92            });
93        itertools::zip_eq(&self.line_map, ranges)
94            .map(|(commit_id, range)| (commit_id.as_ref(), range))
95    }
96
97    /// Returns iterator over compacted `(commit_id, line_range)`s.
98    ///
99    /// Consecutive ranges having the same `commit_id` are merged into one.
100    pub fn compact_line_ranges(
101        &self,
102    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
103        let mut ranges = self.line_ranges();
104        let mut acc = ranges.next();
105        iter::from_fn(move || {
106            let (acc_commit_id, acc_range) = acc.as_mut()?;
107            for (cur_commit_id, cur_range) in ranges.by_ref() {
108                if *acc_commit_id == cur_commit_id {
109                    acc_range.end = cur_range.end;
110                } else {
111                    return acc.replace((cur_commit_id, cur_range));
112                }
113            }
114            acc.take()
115        })
116    }
117
118    /// File content at the starting commit.
119    pub fn text(&self) -> &BStr {
120        self.text.as_ref()
121    }
122}
123
124/// Annotation process for a specific file.
125#[derive(Clone, Debug)]
126pub struct FileAnnotator {
127    // If we add copy-tracing support, file_path might be tracked by state.
128    file_path: RepoPathBuf,
129    original_text: BString,
130    state: AnnotationState,
131}
132
133impl FileAnnotator {
134    /// Initializes annotator for a specific file in the `starting_commit`.
135    ///
136    /// If the file is not found, the result would be empty.
137    pub fn from_commit(starting_commit: &Commit, file_path: &RepoPath) -> BackendResult<Self> {
138        let source = Source::load(starting_commit, file_path)?;
139        Ok(Self::with_source(starting_commit.id(), file_path, source))
140    }
141
142    /// Initializes annotator for a specific file path starting with the given
143    /// content.
144    ///
145    /// The file content at the `starting_commit` is set to `starting_text`.
146    /// This is typically one of the file contents in the conflict or
147    /// merged-parent tree.
148    pub fn with_file_content(
149        starting_commit_id: &CommitId,
150        file_path: &RepoPath,
151        starting_text: impl Into<Vec<u8>>,
152    ) -> Self {
153        let source = Source::new(BString::new(starting_text.into()));
154        Self::with_source(starting_commit_id, file_path, source)
155    }
156
157    fn with_source(
158        starting_commit_id: &CommitId,
159        file_path: &RepoPath,
160        mut source: Source,
161    ) -> Self {
162        source.fill_line_map();
163        let original_text = source.text.clone();
164        let state = AnnotationState {
165            original_line_map: vec![Err(starting_commit_id.clone()); source.line_map.len()],
166            commit_source_map: HashMap::from([(starting_commit_id.clone(), source)]),
167            num_unresolved_roots: 0,
168        };
169        FileAnnotator {
170            file_path: file_path.to_owned(),
171            original_text,
172            state,
173        }
174    }
175
176    /// Computes line-by-line annotation within the `domain`.
177    ///
178    /// The `domain` expression narrows the range of ancestors to search. It
179    /// will be intersected as `domain & ::pending_commits & files(file_path)`.
180    /// The `pending_commits` is assumed to be included in the `domain`.
181    pub fn compute(
182        &mut self,
183        repo: &dyn Repo,
184        domain: &Rc<ResolvedRevsetExpression>,
185    ) -> Result<(), RevsetEvaluationError> {
186        process_commits(repo, &mut self.state, domain, &self.file_path)
187    }
188
189    /// Remaining commit ids to visit from.
190    pub fn pending_commits(&self) -> impl Iterator<Item = &CommitId> {
191        self.state.commit_source_map.keys()
192    }
193
194    /// Returns the current state as line-oriented annotation.
195    pub fn to_annotation(&self) -> FileAnnotation {
196        // Just clone the line map. We might want to change the underlying data
197        // model something akin to interleaved delta in order to get annotation
198        // at a certain ancestor commit without recomputing.
199        FileAnnotation {
200            line_map: self.state.original_line_map.clone(),
201            text: self.original_text.clone(),
202        }
203    }
204}
205
206/// Intermediate state of file annotation.
207#[derive(Clone, Debug)]
208struct AnnotationState {
209    original_line_map: OriginalLineMap,
210    /// Commits to file line mappings and contents.
211    commit_source_map: HashMap<CommitId, Source>,
212    /// Number of unresolved root commits in `commit_source_map`.
213    num_unresolved_roots: usize,
214}
215
216/// Line mapping and file content at a certain commit.
217#[derive(Clone, Debug)]
218struct Source {
219    /// Mapping of line numbers in the file at the current commit to the
220    /// original file, sorted by the line numbers at the current commit.
221    line_map: Vec<(usize, usize)>,
222    /// File content at the current commit.
223    text: BString,
224}
225
226impl Source {
227    fn new(text: BString) -> Self {
228        Source {
229            line_map: Vec::new(),
230            text,
231        }
232    }
233
234    fn load(commit: &Commit, file_path: &RepoPath) -> Result<Self, BackendError> {
235        let tree = commit.tree()?;
236        let text = get_file_contents(commit.store(), file_path, &tree).block_on()?;
237        Ok(Self::new(text))
238    }
239
240    fn fill_line_map(&mut self) {
241        let lines = self.text.split_inclusive(|b| *b == b'\n');
242        self.line_map = lines.enumerate().map(|(i, _)| (i, i)).collect();
243    }
244}
245
246/// List of commit IDs that originated lines, indexed by line numbers in the
247/// original file.
248type OriginalLineMap = Vec<Result<CommitId, CommitId>>;
249
250/// Starting from the source commits, compute changes at that commit relative to
251/// its direct parents, updating the mappings as we go.
252fn process_commits(
253    repo: &dyn Repo,
254    state: &mut AnnotationState,
255    domain: &Rc<ResolvedRevsetExpression>,
256    file_name: &RepoPath,
257) -> Result<(), RevsetEvaluationError> {
258    let predicate = RevsetFilterPredicate::File(FilesetExpression::file_path(file_name.to_owned()));
259    // TODO: If the domain isn't a contiguous range, changes masked out by it
260    // might not be caught by the closest ancestor revision. For example,
261    // domain=merges() would pick up almost nothing because merge revisions
262    // are usually empty. Perhaps, we want to query `files(file_path,
263    // within_sub_graph=domain)`, not `domain & files(file_path)`.
264    let heads = RevsetExpression::commits(state.commit_source_map.keys().cloned().collect());
265    let revset = heads
266        .union(&domain.intersection(&heads.ancestors()).filtered(predicate))
267        .evaluate(repo)?;
268
269    state.num_unresolved_roots = 0;
270    for node in revset.iter_graph() {
271        let (commit_id, edge_list) = node?;
272        process_commit(repo, file_name, state, &commit_id, &edge_list)?;
273        if state.commit_source_map.len() == state.num_unresolved_roots {
274            // No more lines to propagate to ancestors.
275            break;
276        }
277    }
278    Ok(())
279}
280
281/// For a given commit, for each parent, we compare the version in the parent
282/// tree with the current version, updating the mappings for any lines in
283/// common. If the parent doesn't have the file, we skip it.
284fn process_commit(
285    repo: &dyn Repo,
286    file_name: &RepoPath,
287    state: &mut AnnotationState,
288    current_commit_id: &CommitId,
289    edges: &[GraphEdge<CommitId>],
290) -> Result<(), BackendError> {
291    let Some(mut current_source) = state.commit_source_map.remove(current_commit_id) else {
292        return Ok(());
293    };
294
295    for parent_edge in edges {
296        let parent_commit_id = &parent_edge.target;
297        let parent_source = match state.commit_source_map.entry(parent_commit_id.clone()) {
298            hash_map::Entry::Occupied(entry) => entry.into_mut(),
299            hash_map::Entry::Vacant(entry) => {
300                let commit = repo.store().get_commit(entry.key())?;
301                entry.insert(Source::load(&commit, file_name)?)
302            }
303        };
304
305        // For two versions of the same file, for all the lines in common,
306        // overwrite the new mapping in the results for the new commit. Let's
307        // say I have a file in commit A and commit B. We know that according to
308        // local line_map, in commit A, line 3 corresponds to line 7 of the
309        // original file. Now, line 3 in Commit A corresponds to line 6 in
310        // commit B. Then, we update local line_map to say that "Commit B line 6
311        // goes to line 7 of the original file". We repeat this for all lines in
312        // common in the two commits.
313        let mut current_lines = current_source.line_map.iter().copied().peekable();
314        let mut new_current_line_map = Vec::new();
315        let mut new_parent_line_map = Vec::new();
316        copy_same_lines_with(
317            &current_source.text,
318            &parent_source.text,
319            |current_start, parent_start, count| {
320                new_current_line_map
321                    .extend(current_lines.peeking_take_while(|&(cur, _)| cur < current_start));
322                while let Some((current, original)) =
323                    current_lines.next_if(|&(cur, _)| cur < current_start + count)
324                {
325                    let parent = parent_start + (current - current_start);
326                    new_parent_line_map.push((parent, original));
327                }
328            },
329        );
330        new_current_line_map.extend(current_lines);
331        current_source.line_map = new_current_line_map;
332        parent_source.line_map = if parent_source.line_map.is_empty() {
333            new_parent_line_map
334        } else {
335            itertools::merge(parent_source.line_map.iter().copied(), new_parent_line_map).collect()
336        };
337        if parent_source.line_map.is_empty() {
338            state.commit_source_map.remove(parent_commit_id);
339        } else if parent_edge.edge_type == GraphEdgeType::Missing {
340            // If an omitted parent had the file, leave these lines unresolved.
341            // The origin of the unresolved lines is represented as
342            // Err(root_commit_id).
343            for &(_, original_line_number) in &parent_source.line_map {
344                state.original_line_map[original_line_number] = Err(current_commit_id.clone());
345            }
346            state.num_unresolved_roots += 1;
347        }
348    }
349
350    // Once we've looked at all parents of a commit, any leftover lines must be
351    // original to the current commit, so we save this information in
352    // original_line_map.
353    for (_, original_line_number) in current_source.line_map {
354        state.original_line_map[original_line_number] = Ok(current_commit_id.clone());
355    }
356
357    Ok(())
358}
359
360/// For two files, calls `copy(current_start, parent_start, count)` for each
361/// range of contiguous lines in common (e.g. line 8-10 maps to line 9-11.)
362fn copy_same_lines_with(
363    current_contents: &[u8],
364    parent_contents: &[u8],
365    mut copy: impl FnMut(usize, usize, usize),
366) {
367    let diff = Diff::by_line([current_contents, parent_contents]);
368    let mut current_line_counter: usize = 0;
369    let mut parent_line_counter: usize = 0;
370    for hunk in diff.hunks() {
371        match hunk.kind {
372            DiffHunkKind::Matching => {
373                let count = hunk.contents[0].split_inclusive(|b| *b == b'\n').count();
374                copy(current_line_counter, parent_line_counter, count);
375                current_line_counter += count;
376                parent_line_counter += count;
377            }
378            DiffHunkKind::Different => {
379                let current_output = hunk.contents[0];
380                let parent_output = hunk.contents[1];
381                current_line_counter += current_output.split_inclusive(|b| *b == b'\n').count();
382                parent_line_counter += parent_output.split_inclusive(|b| *b == b'\n').count();
383            }
384        }
385    }
386}
387
388async fn get_file_contents(
389    store: &Store,
390    path: &RepoPath,
391    tree: &MergedTree,
392) -> Result<BString, BackendError> {
393    let file_value = tree.path_value(path)?;
394    let effective_file_value = materialize_tree_value(store, path, file_value).await?;
395    match effective_file_value {
396        MaterializedTreeValue::File(mut file) => Ok(file.read_all(path).await?.into()),
397        MaterializedTreeValue::FileConflict(file) => Ok(materialize_merge_result_to_bytes(
398            &file.contents,
399            ConflictMarkerStyle::default(),
400        )),
401        _ => Ok(BString::default()),
402    }
403}
404
405#[cfg(test)]
406mod tests {
407    use super::*;
408
409    #[test]
410    fn test_lines_iterator_empty() {
411        let annotation = FileAnnotation {
412            line_map: vec![],
413            text: "".into(),
414        };
415        assert_eq!(annotation.lines().collect_vec(), vec![]);
416        assert_eq!(annotation.line_ranges().collect_vec(), vec![]);
417        assert_eq!(annotation.compact_line_ranges().collect_vec(), vec![]);
418    }
419
420    #[test]
421    fn test_lines_iterator_with_content() {
422        let commit_id1 = CommitId::from_hex("111111");
423        let commit_id2 = CommitId::from_hex("222222");
424        let commit_id3 = CommitId::from_hex("333333");
425        let annotation = FileAnnotation {
426            line_map: vec![
427                Ok(commit_id1.clone()),
428                Ok(commit_id2.clone()),
429                Ok(commit_id3.clone()),
430            ],
431            text: "foo\n\nbar\n".into(),
432        };
433        assert_eq!(
434            annotation.lines().collect_vec(),
435            vec![
436                (Ok(&commit_id1), "foo\n".as_ref()),
437                (Ok(&commit_id2), "\n".as_ref()),
438                (Ok(&commit_id3), "bar\n".as_ref()),
439            ]
440        );
441        assert_eq!(
442            annotation.line_ranges().collect_vec(),
443            vec![
444                (Ok(&commit_id1), 0..4),
445                (Ok(&commit_id2), 4..5),
446                (Ok(&commit_id3), 5..9),
447            ]
448        );
449        assert_eq!(
450            annotation.compact_line_ranges().collect_vec(),
451            vec![
452                (Ok(&commit_id1), 0..4),
453                (Ok(&commit_id2), 4..5),
454                (Ok(&commit_id3), 5..9),
455            ]
456        );
457    }
458
459    #[test]
460    fn test_lines_iterator_compaction() {
461        let commit_id1 = CommitId::from_hex("111111");
462        let commit_id2 = CommitId::from_hex("222222");
463        let commit_id3 = CommitId::from_hex("333333");
464        let annotation = FileAnnotation {
465            line_map: vec![
466                Ok(commit_id1.clone()),
467                Ok(commit_id1.clone()),
468                Ok(commit_id2.clone()),
469                Ok(commit_id1.clone()),
470                Ok(commit_id3.clone()),
471                Ok(commit_id3.clone()),
472                Ok(commit_id3.clone()),
473            ],
474            text: "\n".repeat(7).into(),
475        };
476        assert_eq!(
477            annotation.compact_line_ranges().collect_vec(),
478            vec![
479                (Ok(&commit_id1), 0..2),
480                (Ok(&commit_id2), 2..3),
481                (Ok(&commit_id1), 3..4),
482                (Ok(&commit_id3), 4..7),
483            ]
484        );
485    }
486}