Skip to main content

jj_lib/
annotate.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Methods that allow annotation (attribution and blame) for a file in a
16//! repository.
17//!
18//! TODO: Add support for different blame layers with a trait in the future.
19//! Like commit metadata and more.
20
21use std::collections::HashMap;
22use std::collections::hash_map;
23use std::iter;
24use std::ops::Range;
25use std::sync::Arc;
26
27use bstr::BStr;
28use bstr::BString;
29use futures::TryStreamExt as _;
30use itertools::Itertools as _;
31
32use crate::backend::BackendError;
33use crate::backend::BackendResult;
34use crate::backend::CommitId;
35use crate::commit::Commit;
36use crate::conflicts::ConflictMarkerStyle;
37use crate::conflicts::ConflictMaterializeOptions;
38use crate::conflicts::MaterializedTreeValue;
39use crate::conflicts::materialize_merge_result_to_bytes;
40use crate::conflicts::materialize_tree_value;
41use crate::diff::ContentDiff;
42use crate::diff::DiffHunkKind;
43use crate::files::FileMergeHunkLevel;
44use crate::fileset::FilesetExpression;
45use crate::graph::GraphEdge;
46use crate::merge::SameChange;
47use crate::merged_tree::MergedTree;
48use crate::repo::Repo;
49use crate::repo_path::RepoPath;
50use crate::repo_path::RepoPathBuf;
51use crate::revset::ResolvedRevsetExpression;
52use crate::revset::RevsetEvaluationError;
53use crate::revset::RevsetExpression;
54use crate::revset::RevsetFilterPredicate;
55use crate::store::Store;
56use crate::tree_merge::MergeOptions;
57
58/// Annotation results for a specific file
59#[derive(Clone, Debug)]
60pub struct FileAnnotation {
61    line_map: OriginalLineMap,
62    text: BString,
63}
64
65impl FileAnnotation {
66    /// Returns iterator over `(line_origin, line)`s.
67    ///
68    /// For each line, `Ok(line_origin)` returns information about the
69    /// originator commit of the line. If no originator commit was found
70    /// within the domain, `Err(line_origin)` should be set. It points to the
71    /// commit outside of the domain where the search stopped.
72    ///
73    /// The `line` includes newline character.
74    pub fn line_origins(&self) -> impl Iterator<Item = (Result<&LineOrigin, &LineOrigin>, &BStr)> {
75        itertools::zip_eq(&self.line_map, self.text.split_inclusive(|b| *b == b'\n'))
76            .map(|(line_origin, line)| (line_origin.as_ref(), line.as_ref()))
77    }
78
79    /// Returns iterator over `(commit_id, line)`s.
80    ///
81    /// For each line, `Ok(commit_id)` points to the originator commit of the
82    /// line. If no originator commit was found within the domain,
83    /// `Err(commit_id)` should be set. It points to the commit outside of the
84    /// domain where the search stopped.
85    ///
86    /// The `line` includes newline character.
87    pub fn lines(&self) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, &BStr)> {
88        itertools::zip_eq(
89            self.commit_ids(),
90            self.text
91                .split_inclusive(|b| *b == b'\n')
92                .map(AsRef::as_ref),
93        )
94    }
95
96    /// Returns iterator over `(commit_id, line_range)`s.
97    ///
98    /// See [`Self::lines()`] for `commit_id`s.
99    ///
100    /// The `line_range` is a slice range in the file `text`. Consecutive ranges
101    /// having the same `commit_id` are not compacted.
102    pub fn line_ranges(
103        &self,
104    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
105        let ranges = self
106            .text
107            .split_inclusive(|b| *b == b'\n')
108            .scan(0, |total, line| {
109                let start = *total;
110                *total += line.len();
111                Some(start..*total)
112            });
113        itertools::zip_eq(self.commit_ids(), ranges)
114    }
115
116    /// Returns iterator over compacted `(commit_id, line_range)`s.
117    ///
118    /// Consecutive ranges having the same `commit_id` are merged into one.
119    pub fn compact_line_ranges(
120        &self,
121    ) -> impl Iterator<Item = (Result<&CommitId, &CommitId>, Range<usize>)> {
122        let mut ranges = self.line_ranges();
123        let mut acc = ranges.next();
124        iter::from_fn(move || {
125            let (acc_commit_id, acc_range) = acc.as_mut()?;
126            for (cur_commit_id, cur_range) in ranges.by_ref() {
127                if *acc_commit_id == cur_commit_id {
128                    acc_range.end = cur_range.end;
129                } else {
130                    return acc.replace((cur_commit_id, cur_range));
131                }
132            }
133            acc.take()
134        })
135    }
136
137    /// File content at the starting commit.
138    pub fn text(&self) -> &BStr {
139        self.text.as_ref()
140    }
141
142    fn commit_ids(&self) -> impl Iterator<Item = Result<&CommitId, &CommitId>> {
143        self.line_map.iter().map(|line_origin| {
144            line_origin
145                .as_ref()
146                .map(|origin| &origin.commit_id)
147                .map_err(|origin| &origin.commit_id)
148        })
149    }
150}
151
152/// Annotation process for a specific file.
153#[derive(Clone, Debug)]
154pub struct FileAnnotator {
155    // If we add copy-tracing support, file_path might be tracked by state.
156    file_path: RepoPathBuf,
157    starting_text: BString,
158    state: AnnotationState,
159}
160
161impl FileAnnotator {
162    /// Initializes annotator for a specific file in the `starting_commit`.
163    ///
164    /// If the file is not found, the result would be empty.
165    pub async fn from_commit(
166        starting_commit: &Commit,
167        file_path: &RepoPath,
168    ) -> BackendResult<Self> {
169        let source = Source::load(starting_commit, file_path).await?;
170        Ok(Self::with_source(starting_commit.id(), file_path, source))
171    }
172
173    /// Initializes annotator for a specific file path starting with the given
174    /// content.
175    ///
176    /// The file content at the `starting_commit` is set to `starting_text`.
177    /// This is typically one of the file contents in the conflict or
178    /// merged-parent tree.
179    pub fn with_file_content(
180        starting_commit_id: &CommitId,
181        file_path: &RepoPath,
182        starting_text: impl Into<Vec<u8>>,
183    ) -> Self {
184        let source = Source::new(BString::new(starting_text.into()));
185        Self::with_source(starting_commit_id, file_path, source)
186    }
187
188    fn with_source(
189        starting_commit_id: &CommitId,
190        file_path: &RepoPath,
191        mut source: Source,
192    ) -> Self {
193        source.fill_line_map();
194        let starting_text = source.text.clone();
195        let state = AnnotationState {
196            original_line_map: (0..source.line_map.len())
197                .map(|line_number| {
198                    Err(LineOrigin {
199                        commit_id: starting_commit_id.clone(),
200                        line_number,
201                    })
202                })
203                .collect(),
204            commit_source_map: HashMap::from([(starting_commit_id.clone(), source)]),
205            num_unresolved_roots: 0,
206        };
207        Self {
208            file_path: file_path.to_owned(),
209            starting_text,
210            state,
211        }
212    }
213
214    /// Computes line-by-line annotation within the `domain`.
215    ///
216    /// The `domain` expression narrows the range of ancestors to search. It
217    /// will be intersected as `domain & ::pending_commits & files(file_path)`.
218    /// The `pending_commits` is assumed to be included in the `domain`.
219    pub async fn compute(
220        &mut self,
221        repo: &dyn Repo,
222        domain: &Arc<ResolvedRevsetExpression>,
223    ) -> Result<(), RevsetEvaluationError> {
224        process_commits(repo, &mut self.state, domain, &self.file_path).await
225    }
226
227    /// Remaining commit ids to visit from.
228    pub fn pending_commits(&self) -> impl Iterator<Item = &CommitId> {
229        self.state.commit_source_map.keys()
230    }
231
232    /// Returns the current state as line-oriented annotation.
233    pub fn to_annotation(&self) -> FileAnnotation {
234        // Just clone the line map. We might want to change the underlying data
235        // model something akin to interleaved delta in order to get annotation
236        // at a certain ancestor commit without recomputing.
237        FileAnnotation {
238            line_map: self.state.original_line_map.clone(),
239            text: self.starting_text.clone(),
240        }
241    }
242}
243
244/// Intermediate state of file annotation.
245#[derive(Clone, Debug)]
246struct AnnotationState {
247    original_line_map: OriginalLineMap,
248    /// Commits to file line mappings and contents.
249    commit_source_map: HashMap<CommitId, Source>,
250    /// Number of unresolved root commits in `commit_source_map`.
251    num_unresolved_roots: usize,
252}
253
254/// Line mapping and file content at a certain commit.
255#[derive(Clone, Debug)]
256struct Source {
257    /// Mapping of line numbers in the file at the current commit to the
258    /// starting file, sorted by the line numbers at the current commit.
259    line_map: Vec<(usize, usize)>,
260    /// File content at the current commit.
261    text: BString,
262}
263
264impl Source {
265    fn new(text: BString) -> Self {
266        Self {
267            line_map: Vec::new(),
268            text,
269        }
270    }
271
272    async fn load(commit: &Commit, file_path: &RepoPath) -> Result<Self, BackendError> {
273        let tree = commit.tree();
274        let text = get_file_contents(commit.store(), file_path, &tree).await?;
275        Ok(Self::new(text))
276    }
277
278    fn fill_line_map(&mut self) {
279        let lines = self.text.split_inclusive(|b| *b == b'\n');
280        self.line_map = lines.enumerate().map(|(i, _)| (i, i)).collect();
281    }
282}
283
284/// List of origins for each line, indexed by line numbers in the
285/// starting file.
286type OriginalLineMap = Vec<Result<LineOrigin, LineOrigin>>;
287
288/// Information about the origin of an annotated line.
289#[derive(Clone, Debug, Eq, PartialEq)]
290pub struct LineOrigin {
291    /// Commit ID where the line was introduced.
292    pub commit_id: CommitId,
293    /// 0-based line number of the line in the origin commit.
294    pub line_number: usize,
295}
296
297/// Starting from the source commits, compute changes at that commit relative to
298/// its direct parents, updating the mappings as we go.
299async fn process_commits(
300    repo: &dyn Repo,
301    state: &mut AnnotationState,
302    domain: &Arc<ResolvedRevsetExpression>,
303    file_name: &RepoPath,
304) -> Result<(), RevsetEvaluationError> {
305    let predicate = RevsetFilterPredicate::File(FilesetExpression::file_path(file_name.to_owned()));
306    // TODO: If the domain isn't a contiguous range, changes masked out by it
307    // might not be caught by the closest ancestor revision. For example,
308    // domain=merges() would pick up almost nothing because merge revisions
309    // are usually empty. Perhaps, we want to query `files(file_path,
310    // within_sub_graph=domain)`, not `domain & files(file_path)`.
311    let heads = RevsetExpression::commits(state.commit_source_map.keys().cloned().collect());
312    let revset = heads
313        .union(&domain.intersection(&heads.ancestors()).filtered(predicate))
314        .evaluate(repo)?;
315
316    state.num_unresolved_roots = 0;
317    let mut nodes = revset.stream_graph();
318    while let Some((commit_id, edge_list)) = nodes.try_next().await? {
319        process_commit(repo, file_name, state, &commit_id, &edge_list).await?;
320        if state.commit_source_map.len() == state.num_unresolved_roots {
321            // No more lines to propagate to ancestors.
322            break;
323        }
324    }
325    Ok(())
326}
327
328/// For a given commit, for each parent, we compare the version in the parent
329/// tree with the current version, updating the mappings for any lines in
330/// common. If the parent doesn't have the file, we skip it.
331async fn process_commit(
332    repo: &dyn Repo,
333    file_name: &RepoPath,
334    state: &mut AnnotationState,
335    current_commit_id: &CommitId,
336    edges: &[GraphEdge<CommitId>],
337) -> Result<(), BackendError> {
338    let Some(mut current_source) = state.commit_source_map.remove(current_commit_id) else {
339        return Ok(());
340    };
341
342    for parent_edge in edges {
343        let parent_commit_id = &parent_edge.target;
344        let parent_source = match state.commit_source_map.entry(parent_commit_id.clone()) {
345            hash_map::Entry::Occupied(entry) => entry.into_mut(),
346            hash_map::Entry::Vacant(entry) => {
347                let commit = repo.store().get_commit_async(entry.key()).await?;
348                entry.insert(Source::load(&commit, file_name).await?)
349            }
350        };
351
352        // For two versions of the same file, for all the lines in common,
353        // overwrite the new mapping in the results for the new commit. Let's
354        // say I have a file in commit A and commit B. We know that according to
355        // local line_map, in commit A, line 3 corresponds to line 7 of the
356        // starting file. Now, line 3 in Commit A corresponds to line 6 in
357        // commit B. Then, we update local line_map to say that "Commit B line 6
358        // goes to line 7 of the starting file". We repeat this for all lines in
359        // common in the two commits.
360        let mut current_lines = current_source.line_map.iter().copied().peekable();
361        let mut new_current_line_map = Vec::new();
362        let mut new_parent_line_map = Vec::new();
363        copy_same_lines_with(
364            &current_source.text,
365            &parent_source.text,
366            |current_start, parent_start, count| {
367                new_current_line_map
368                    .extend(current_lines.peeking_take_while(|&(cur, _)| cur < current_start));
369                while let Some((current, starting)) =
370                    current_lines.next_if(|&(cur, _)| cur < current_start + count)
371                {
372                    let parent = parent_start + (current - current_start);
373                    new_parent_line_map.push((parent, starting));
374                }
375            },
376        );
377        new_current_line_map.extend(current_lines);
378        current_source.line_map = new_current_line_map;
379        parent_source.line_map = if parent_source.line_map.is_empty() {
380            new_parent_line_map
381        } else {
382            itertools::merge(parent_source.line_map.iter().copied(), new_parent_line_map).collect()
383        };
384        if parent_source.line_map.is_empty() {
385            state.commit_source_map.remove(parent_commit_id);
386        } else if parent_edge.is_missing() {
387            // If an omitted parent had the file, leave these lines unresolved.
388            // The origin of the unresolved lines is represented as
389            // Err(LineOrigin { parent_commit_id, parent_line_number }).
390            for &(parent_line_number, starting_line_number) in &parent_source.line_map {
391                state.original_line_map[starting_line_number] = Err(LineOrigin {
392                    commit_id: parent_commit_id.clone(),
393                    line_number: parent_line_number,
394                });
395            }
396            state.num_unresolved_roots += 1;
397        }
398    }
399
400    // Once we've looked at all parents of a commit, any leftover lines must be
401    // original to the current commit, so we save this information in
402    // original_line_map.
403    for (current_line_number, starting_line_number) in current_source.line_map {
404        state.original_line_map[starting_line_number] = Ok(LineOrigin {
405            commit_id: current_commit_id.clone(),
406            line_number: current_line_number,
407        });
408    }
409
410    Ok(())
411}
412
413/// For two files, calls `copy(current_start, parent_start, count)` for each
414/// range of contiguous lines in common (e.g. line 8-10 maps to line 9-11.)
415fn copy_same_lines_with(
416    current_contents: &[u8],
417    parent_contents: &[u8],
418    mut copy: impl FnMut(usize, usize, usize),
419) {
420    let diff = ContentDiff::by_line([current_contents, parent_contents]);
421    let mut current_line_counter: usize = 0;
422    let mut parent_line_counter: usize = 0;
423    for hunk in diff.hunks() {
424        match hunk.kind {
425            DiffHunkKind::Matching => {
426                let count = hunk.contents[0].split_inclusive(|b| *b == b'\n').count();
427                copy(current_line_counter, parent_line_counter, count);
428                current_line_counter += count;
429                parent_line_counter += count;
430            }
431            DiffHunkKind::Different => {
432                let current_output = hunk.contents[0];
433                let parent_output = hunk.contents[1];
434                current_line_counter += current_output.split_inclusive(|b| *b == b'\n').count();
435                parent_line_counter += parent_output.split_inclusive(|b| *b == b'\n').count();
436            }
437        }
438    }
439}
440
441async fn get_file_contents(
442    store: &Store,
443    path: &RepoPath,
444    tree: &MergedTree,
445) -> Result<BString, BackendError> {
446    let file_value = tree.path_value(path).await?;
447    let effective_file_value =
448        materialize_tree_value(store, path, file_value, tree.labels()).await?;
449    match effective_file_value {
450        MaterializedTreeValue::File(mut file) => Ok(file.read_all(path).await?.into()),
451        MaterializedTreeValue::FileConflict(file) => {
452            // TODO: track line origins without materializing
453            let options = ConflictMaterializeOptions {
454                marker_style: ConflictMarkerStyle::Diff,
455                marker_len: None,
456                merge: MergeOptions {
457                    hunk_level: FileMergeHunkLevel::Line,
458                    same_change: SameChange::Accept,
459                },
460            };
461            Ok(materialize_merge_result_to_bytes(
462                &file.contents,
463                &file.labels,
464                &options,
465            ))
466        }
467        _ => Ok(BString::default()),
468    }
469}
470
471#[cfg(test)]
472mod tests {
473    use super::*;
474
475    fn make_line_origin(commit_id: &CommitId, line_number: usize) -> LineOrigin {
476        LineOrigin {
477            commit_id: commit_id.clone(),
478            line_number,
479        }
480    }
481
482    #[test]
483    fn test_lines_iterator_empty() {
484        let annotation = FileAnnotation {
485            line_map: vec![],
486            text: "".into(),
487        };
488        assert_eq!(annotation.line_origins().collect_vec(), vec![]);
489        assert_eq!(annotation.lines().collect_vec(), vec![]);
490        assert_eq!(annotation.line_ranges().collect_vec(), vec![]);
491        assert_eq!(annotation.compact_line_ranges().collect_vec(), vec![]);
492    }
493
494    #[test]
495    fn test_lines_iterator_with_content() {
496        let commit_id1 = CommitId::from_hex("111111");
497        let commit_id2 = CommitId::from_hex("222222");
498        let commit_id3 = CommitId::from_hex("333333");
499        let annotation = FileAnnotation {
500            line_map: vec![
501                Ok(make_line_origin(&commit_id1, 0)),
502                Ok(make_line_origin(&commit_id2, 1)),
503                Ok(make_line_origin(&commit_id3, 2)),
504            ],
505            text: "foo\n\nbar\n".into(),
506        };
507        assert_eq!(
508            annotation.line_origins().collect_vec(),
509            vec![
510                (Ok(&make_line_origin(&commit_id1, 0)), "foo\n".as_ref()),
511                (Ok(&make_line_origin(&commit_id2, 1)), "\n".as_ref()),
512                (Ok(&make_line_origin(&commit_id3, 2)), "bar\n".as_ref()),
513            ]
514        );
515        assert_eq!(
516            annotation.lines().collect_vec(),
517            vec![
518                (Ok(&commit_id1), "foo\n".as_ref()),
519                (Ok(&commit_id2), "\n".as_ref()),
520                (Ok(&commit_id3), "bar\n".as_ref()),
521            ]
522        );
523        assert_eq!(
524            annotation.line_ranges().collect_vec(),
525            vec![
526                (Ok(&commit_id1), 0..4),
527                (Ok(&commit_id2), 4..5),
528                (Ok(&commit_id3), 5..9),
529            ]
530        );
531        assert_eq!(
532            annotation.compact_line_ranges().collect_vec(),
533            vec![
534                (Ok(&commit_id1), 0..4),
535                (Ok(&commit_id2), 4..5),
536                (Ok(&commit_id3), 5..9),
537            ]
538        );
539    }
540
541    #[test]
542    fn test_lines_iterator_compaction() {
543        let commit_id1 = CommitId::from_hex("111111");
544        let commit_id2 = CommitId::from_hex("222222");
545        let commit_id3 = CommitId::from_hex("333333");
546        let annotation = FileAnnotation {
547            line_map: vec![
548                Ok(make_line_origin(&commit_id1, 0)),
549                Ok(make_line_origin(&commit_id1, 1)),
550                Ok(make_line_origin(&commit_id2, 2)),
551                Ok(make_line_origin(&commit_id1, 3)),
552                Ok(make_line_origin(&commit_id3, 4)),
553                Ok(make_line_origin(&commit_id3, 5)),
554                Ok(make_line_origin(&commit_id3, 6)),
555            ],
556            text: "\n".repeat(7).into(),
557        };
558        assert_eq!(
559            annotation.compact_line_ranges().collect_vec(),
560            vec![
561                (Ok(&commit_id1), 0..2),
562                (Ok(&commit_id2), 2..3),
563                (Ok(&commit_id1), 3..4),
564                (Ok(&commit_id3), 4..7),
565            ]
566        );
567    }
568}