Skip to main content

jj_lib/
fix.rs

1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! API for transforming file content, for example to apply formatting, and
16//! propagate those changes across revisions.
17
18use std::collections::HashMap;
19use std::collections::HashSet;
20use std::sync::mpsc::channel;
21
22use futures::StreamExt as _;
23use futures::TryStreamExt as _;
24use futures::future::try_join_all;
25use indexmap::IndexSet;
26use jj_lib::backend::BackendError;
27use jj_lib::backend::CommitId;
28use jj_lib::backend::FileId;
29use jj_lib::backend::TreeValue;
30use jj_lib::commit::Commit;
31use jj_lib::diff::ContentDiff;
32use jj_lib::diff::DiffHunkKind;
33use jj_lib::matchers::Matcher;
34use jj_lib::merged_tree::TreeDiffEntry;
35use jj_lib::merged_tree_builder::MergedTreeBuilder;
36use jj_lib::repo::MutableRepo;
37use jj_lib::repo::Repo as _;
38use jj_lib::repo_path::RepoPathBuf;
39use jj_lib::revset::RevsetExpression;
40use jj_lib::rewrite::merge_commit_trees;
41use jj_lib::store::Store;
42use rayon::iter::IntoParallelIterator as _;
43use rayon::prelude::ParallelIterator as _;
44
45use crate::revset::RevsetEvaluationError;
46use crate::revset::RevsetStreamExt as _;
47
48/// Represents a file whose content may be transformed by a FileFixer.
49#[derive(Debug, PartialEq, Eq, Hash, Clone)]
50pub struct FileToFix {
51    /// Unique identifier for the file content.
52    pub file_id: FileId,
53
54    /// The base FileId for the file content. We will use this FileId to
55    /// create the diff between the base commit's file content and the current
56    /// commit's file content before the fix.
57    pub base_file_id: Option<FileId>,
58
59    /// The path is provided to allow the FileFixer to potentially:
60    ///  - Choose different behaviors for different file names, extensions, etc.
61    ///  - Update parts of the file's content that should be derived from the
62    ///    file's path.
63    pub repo_path: RepoPathBuf,
64}
65
66/// Error fixing files.
67#[derive(Debug, thiserror::Error)]
68pub enum FixError {
69    /// Error while contacting the Backend.
70    #[error(transparent)]
71    Backend(#[from] BackendError),
72    /// Error resolving commit ancestry.
73    #[error(transparent)]
74    RevsetEvaluation(#[from] RevsetEvaluationError),
75    /// Error occurred while reading/writing file content.
76    #[error(transparent)]
77    Io(#[from] std::io::Error),
78    /// Error occurred while processing the file content.
79    #[error(transparent)]
80    FixContent(Box<dyn std::error::Error + Send + Sync>),
81}
82
83/// Fixes a set of files.
84///
85/// Fixing a file is implementation dependent. For example it may format source
86/// code using a code formatter.
87pub trait FileFixer {
88    /// Fixes a set of files. Stores the resulting file content (for modified
89    /// files).
90    ///
91    /// Returns a map describing the subset of `files_to_fix` that resulted in
92    /// changed file content (unchanged files should not be present in the map),
93    /// pointing to the new FileId for the file.
94    ///
95    /// TODO: Better error handling so we can tell the user what went wrong with
96    /// each failed input.
97    fn fix_files<'a>(
98        &mut self,
99        store: &Store,
100        files_to_fix: &'a HashSet<FileToFix>,
101    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError>;
102}
103
104/// Aggregate information about the outcome of the file fixer.
105#[derive(Debug, Default)]
106pub struct FixSummary {
107    /// The commits that were rewritten. Maps old commit id to new commit id.
108    pub rewrites: HashMap<CommitId, CommitId>,
109
110    /// The number of commits that had files that were passed to the file fixer.
111    pub num_checked_commits: i32,
112    /// The number of new commits created due to file content changed by the
113    /// fixer.
114    pub num_fixed_commits: i32,
115}
116
117/// A [FileFixer] that applies fix_fn to each file, in parallel.
118///
119/// The implementation is currently based on [rayon].
120// TODO: Consider switching to futures, or document the decision not to. We
121// don't need threads unless the threads will be doing more than waiting for
122// pipes.
123pub struct ParallelFileFixer<T> {
124    fix_fn: T,
125}
126
127impl<T> ParallelFileFixer<T>
128where
129    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
130{
131    /// Creates a ParallelFileFixer.
132    pub fn new(fix_fn: T) -> Self {
133        Self { fix_fn }
134    }
135}
136
137impl<T> FileFixer for ParallelFileFixer<T>
138where
139    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
140{
141    /// Applies `fix_fn()` to the inputs and stores the resulting file content.
142    fn fix_files<'a>(
143        &mut self,
144        store: &Store,
145        files_to_fix: &'a HashSet<FileToFix>,
146    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError> {
147        let (updates_tx, updates_rx) = channel();
148        files_to_fix.into_par_iter().try_for_each_init(
149            || updates_tx.clone(),
150            |updates_tx, file_to_fix| -> Result<(), FixError> {
151                let result = (self.fix_fn)(store, file_to_fix)?;
152                match result {
153                    Some(new_file_id) => {
154                        updates_tx.send((file_to_fix, new_file_id)).unwrap();
155                        Ok(())
156                    }
157                    None => Ok(()),
158                }
159            },
160        )?;
161        drop(updates_tx);
162        let mut result = HashMap::new();
163        while let Ok((file_to_fix, new_file_id)) = updates_rx.recv() {
164            result.insert(file_to_fix, new_file_id);
165        }
166        Ok(result)
167    }
168}
169
170/// Updates files with formatting fixes or other changes, using the given
171/// FileFixer.
172///
173/// The primary use case is to apply the results of automatic code formatting
174/// tools to revisions that may not be properly formatted yet. It can also be
175/// used to modify files with other tools like `sed` or `sort`.
176///
177/// After the FileFixer is done, descendants are also updated, which ensures
178/// that the fixes are not lost. This will never result in new conflicts. Files
179/// with existing conflicts are updated on all sides of the conflict, which
180/// can potentially increase or decrease the number of conflict markers.
181pub async fn fix_files(
182    root_commits: Vec<CommitId>,
183    matcher: &dyn Matcher,
184    include_unchanged_files: bool,
185    repo_mut: &mut MutableRepo,
186    file_fixer: &mut impl FileFixer,
187) -> Result<FixSummary, FixError> {
188    let mut summary = FixSummary::default();
189
190    // Collect all of the unique `FileToFix`s we're going to use. file_fixer should
191    // be deterministic, and should not consider outside information, so it is
192    // safe to deduplicate inputs that correspond to multiple files or commits.
193    // This is typically more efficient, but it does prevent certain use cases
194    // like providing commit IDs as inputs to be inserted into files. We also
195    // need to record the mapping between files-to-fix and paths/commits, to
196    // efficiently rewrite the commits later.
197    //
198    // If a path is being fixed in a particular commit, it must also be fixed in all
199    // that commit's descendants. We do this as a way of propagating changes,
200    // under the assumption that it is more useful than performing a rebase and
201    // risking merge conflicts. In the case of code formatters, rebasing wouldn't
202    // reliably produce well formatted code anyway. Deduplicating inputs helps
203    // to prevent quadratic growth in the number of tool executions required for
204    // doing this in long chains of commits with disjoint sets of modified files.
205    let commits: Vec<_> = RevsetExpression::commits(root_commits.clone())
206        .descendants()
207        .evaluate(repo_mut)?
208        .stream()
209        .commits(repo_mut.store())
210        .try_collect()
211        .await?;
212    tracing::debug!(
213        ?root_commits,
214        ?commits,
215        "looking for files to fix in commits:"
216    );
217
218    // Determine the base commit(s) for each commit.
219    let base_commit_map = get_base_commit_map(&commits).await?;
220
221    // Maps repo paths in a commit to the base FileId for that path.
222    // Even if a file is conflicted in the current commit (having multiple
223    // FileIds at the same path), all sides share the same base FileId (derived
224    // from the first side of the base), so we don't need the current FileId in
225    // the key.
226    let mut base_files: HashMap<(CommitId, RepoPathBuf), FileId> = HashMap::new();
227
228    let mut unique_files_to_fix: HashSet<FileToFix> = HashSet::new();
229    let mut commit_paths: HashMap<CommitId, HashSet<RepoPathBuf>> = HashMap::new();
230    for commit in commits.iter().rev() {
231        let mut paths: HashSet<RepoPathBuf> = HashSet::new();
232
233        // Compute the base tree for the current commit.
234        let mut base_commits = Vec::new();
235        let base_commit_ids = base_commit_map.get(commit.id()).unwrap();
236        for base_commit_id in base_commit_ids {
237            if let Some(base_paths) = commit_paths.get(base_commit_id) {
238                paths.extend(base_paths.iter().cloned());
239            }
240            let base_commit = repo_mut.store().get_commit_async(base_commit_id).await?;
241            base_commits.push(base_commit);
242        }
243        let base_tree = merge_commit_trees(repo_mut, &base_commits).await?;
244
245        // If --include-unchanged-files, we always fix every matching file in the tree.
246        // Otherwise, we fix the matching changed files in this commit, plus any that
247        // were fixed in ancestors, so we don't lose those changes. We do this
248        // instead of rebasing onto those changes, to avoid merge conflicts.
249        let diff_base_tree = if include_unchanged_files {
250            &repo_mut.store().empty_merged_tree()
251        } else {
252            &base_tree
253        };
254
255        // TODO: handle copy tracking
256        let mut diff_stream = diff_base_tree.diff_stream(&commit.tree(), &matcher);
257        while let Some(TreeDiffEntry {
258            path: repo_path,
259            values,
260        }) = diff_stream.next().await
261        {
262            let values = values?;
263            if values.after.is_absent() {
264                continue;
265            }
266            let before = if include_unchanged_files {
267                base_tree.path_value(&repo_path).await?.into_iter().next()
268            } else {
269                values.before.into_iter().next()
270            };
271
272            // Deleted files have no file content to fix, and they have no terms in `after`,
273            // so we don't add any files-to-fix for them. For conflicted files in the base
274            // commit(s), we diff against the first side of the conflict. For conflicted
275            // files in the current commit, we add all sides of the conflict to
276            // the files-to-fix.
277            let before_file_id = if let Some(Some(TreeValue::File {
278                id: before_id,
279                executable: _,
280                copy_id: _,
281            })) = before
282            {
283                base_files.insert((commit.id().clone(), repo_path.clone()), before_id.clone());
284                Some(before_id.clone())
285            } else {
286                None
287            };
288
289            for after_term in values.after {
290                // We currently only support fixing the content of normal files, so we skip
291                // directories and symlinks, and we ignore the executable bit.
292                if let Some(TreeValue::File {
293                    id,
294                    executable: _,
295                    copy_id: _,
296                }) = after_term
297                {
298                    // TODO: Skip the file if its content is larger than some configured size,
299                    // preferably without actually reading it yet.
300                    let file_to_fix = FileToFix {
301                        file_id: id.clone(),
302                        base_file_id: before_file_id.clone(),
303                        repo_path: repo_path.clone(),
304                    };
305                    unique_files_to_fix.insert(file_to_fix.clone());
306                    paths.insert(repo_path.clone());
307                }
308            }
309        }
310        commit_paths.insert(commit.id().clone(), paths);
311    }
312
313    tracing::debug!(
314        ?include_unchanged_files,
315        ?unique_files_to_fix,
316        "invoking file fixer on these files:"
317    );
318
319    // Fix all of the chosen inputs.
320    let fixed_file_ids = file_fixer.fix_files(repo_mut.store().as_ref(), &unique_files_to_fix)?;
321    tracing::debug!(?fixed_file_ids, "file fixer fixed these files:");
322
323    // Substitute the fixed file IDs into all of the affected commits. Currently,
324    // fixes cannot delete or rename files, change the executable bit, or modify
325    // other parts of the commit like the description.
326    repo_mut
327        .transform_descendants(root_commits, async |rewriter| {
328            // TODO: Build the trees in parallel before `transform_descendants()` and only
329            // keep the tree IDs in memory, so we can pass them to the rewriter.
330            let old_commit_id = rewriter.old_commit().id().clone();
331            let repo_paths = commit_paths.get(&old_commit_id).unwrap();
332            let old_tree = rewriter.old_commit().tree();
333            let mut tree_builder = MergedTreeBuilder::new(old_tree.clone());
334            let mut has_changes = false;
335            for repo_path in repo_paths {
336                let old_value = old_tree.path_value(repo_path).await?;
337                let base_file_id = base_files.get(&(old_commit_id.clone(), repo_path.clone()));
338                let new_value = old_value.map(|old_term| {
339                    if let Some(TreeValue::File {
340                        id,
341                        executable,
342                        copy_id,
343                    }) = old_term
344                    {
345                        let file_to_fix = FileToFix {
346                            file_id: id.clone(),
347                            base_file_id: base_file_id.cloned(),
348                            repo_path: repo_path.clone(),
349                        };
350                        if let Some(new_id) = fixed_file_ids.get(&file_to_fix) {
351                            return Some(TreeValue::File {
352                                id: new_id.clone(),
353                                executable: *executable,
354                                copy_id: copy_id.clone(),
355                            });
356                        }
357                    }
358                    old_term.clone()
359                });
360                if new_value != old_value {
361                    tree_builder.set_or_remove(repo_path.clone(), new_value);
362                    has_changes = true;
363                }
364            }
365            summary.num_checked_commits += 1;
366            if has_changes {
367                summary.num_fixed_commits += 1;
368                let new_tree = tree_builder.write_tree().await?;
369                let builder = rewriter.reparent();
370                let new_commit = builder.set_tree(new_tree).write().await?;
371                summary
372                    .rewrites
373                    .insert(old_commit_id, new_commit.id().clone());
374            } else if rewriter.parents_changed() {
375                let new_commit = rewriter.reparent().write().await?;
376                summary
377                    .rewrites
378                    .insert(old_commit_id, new_commit.id().clone());
379            }
380            Ok(())
381        })
382        .await?;
383
384    tracing::debug!(?summary);
385    Ok(summary)
386}
387
388/// Representation of different ranges formatters can use to emit diff ranges.
389#[derive(Debug, PartialEq, Eq)]
390pub enum RegionsToFormat {
391    /// Line ranges (1-based, inclusive [first, last]).
392    LineRanges(Vec<LineRange>),
393}
394
395/// A formattable range of lines or bytes.
396#[derive(Debug, PartialEq, Eq)]
397pub struct FormatRange {
398    /// The first (inclusive) of the range.
399    pub first: usize,
400    /// The last (inclusive) of the range.
401    pub last: usize,
402}
403
404impl FormatRange {
405    /// Creates a new `FormatRange`.
406    pub fn new(first: usize, last: usize) -> Self {
407        Self { first, last }
408    }
409}
410
411/// A line range (1-based, inclusive [first, last]).
412pub type LineRange = FormatRange;
413
414/// Computes the 1-based line ranges in `current` that are different from
415/// `base`. The ranges produced can be empty.
416pub fn compute_changed_ranges(base: &[u8], current: &[u8]) -> RegionsToFormat {
417    let mut ranges: Vec<LineRange> = Vec::new();
418    if current.is_empty() {
419        return RegionsToFormat::LineRanges(ranges);
420    }
421
422    let diff = ContentDiff::by_line([base, current]);
423    let mut current_line = 1;
424    for hunk in diff.hunks() {
425        let line_count = compute_file_line_count(hunk.contents[1]);
426        match hunk.kind {
427            DiffHunkKind::Matching => {}
428            DiffHunkKind::Different => {
429                if line_count > 0 {
430                    // We want the diff ranges to be 1-based and inclusive [first, last] as this
431                    // is what most formatters expect.
432                    ranges.push(LineRange {
433                        first: current_line,
434                        last: current_line + line_count - 1,
435                    });
436                }
437            }
438        }
439        current_line += line_count;
440    }
441
442    RegionsToFormat::LineRanges(ranges)
443}
444
445/// Computes the number of lines in a byte slice (i.e. a file).
446pub fn compute_file_line_count(text: &[u8]) -> usize {
447    let line_count = text.iter().filter(|&&b| b == b'\n').count();
448    let extra = if !text.is_empty() && !text.ends_with(b"\n") {
449        1
450    } else {
451        0
452    };
453    line_count + extra
454}
455
456/// Given a vector of commits, determine the base commit(s) for each of the
457/// commits in the vector.
458///
459/// Notes:
460/// - `commits` must be sorted in reverse topological order (children before
461///   parents).
462/// - The returned base commits are the closest ancestors for each commit that
463///   are not in `commits`. They may include ancestors of other base commits.
464///
465/// The current commit will diff against the base commit(s) to determine the
466/// modified files that need to be `jj fix`ed. We also use these base commits to
467/// compute modified lines by diffing the file content in the current commit
468/// against the file content in the base commit(s).
469///
470/// This is public only for testing purposes.
471pub async fn get_base_commit_map(
472    commits: &[Commit],
473) -> Result<HashMap<CommitId, IndexSet<CommitId>>, FixError> {
474    let commit_ids: HashSet<&CommitId> = commits.iter().map(|c| c.id()).collect();
475    let parents_lists = try_join_all(commits.iter().map(|c| c.parents())).await?;
476    let base_commit_ids: HashSet<CommitId> = parents_lists
477        .into_iter()
478        .flatten()
479        .filter(|parent| !commit_ids.contains(parent.id()))
480        .map(|base_commit| base_commit.id().clone())
481        .collect();
482
483    // Build a map of each commit to its "base commits" (closest ancestors not in
484    // `commits`).
485    //
486    // We process commits in topological order (parents before children) so that
487    // we can propagate the base commits from parents to children. Note that the
488    // `commits` vector is in reverse topological order, so we iterate in reverse.
489    let mut base_commit_map: HashMap<CommitId, IndexSet<CommitId>> = HashMap::new();
490    for commit in commits.iter().rev() {
491        let mut parent_commit_ids: IndexSet<CommitId> = IndexSet::new();
492
493        for parent_id in commit.parent_ids() {
494            if let Some(parent_bases) = base_commit_map.get(parent_id) {
495                parent_commit_ids.extend(parent_bases.iter().cloned());
496            }
497            if base_commit_ids.contains(parent_id) {
498                parent_commit_ids.insert(parent_id.clone());
499            }
500        }
501        base_commit_map.insert(commit.id().clone(), parent_commit_ids);
502    }
503
504    Ok(base_commit_map)
505}
506
507#[cfg(test)]
508mod tests {
509    use super::*;
510
511    #[test]
512    fn test_compute_file_line_count() {
513        assert_eq!(compute_file_line_count(b""), 0);
514        assert_eq!(compute_file_line_count(b"a"), 1);
515        assert_eq!(compute_file_line_count(b"a\n"), 1);
516        assert_eq!(compute_file_line_count(b"a\nb"), 2);
517        assert_eq!(compute_file_line_count(b"a\nb\n"), 2);
518    }
519}