// jj_lib/fix.rs

// Copyright 2024 The Jujutsu Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! API for transforming file content, for example to apply formatting, and
//! propagate those changes across revisions.
17
18use std::collections::HashMap;
19use std::collections::HashSet;
20use std::sync::mpsc::channel;
21
22use futures::StreamExt as _;
23use itertools::Itertools as _;
24use jj_lib::backend::BackendError;
25use jj_lib::backend::CommitId;
26use jj_lib::backend::FileId;
27use jj_lib::backend::TreeValue;
28use jj_lib::matchers::Matcher;
29use jj_lib::merged_tree::MergedTree;
30use jj_lib::merged_tree::MergedTreeBuilder;
31use jj_lib::merged_tree::TreeDiffEntry;
32use jj_lib::repo::MutableRepo;
33use jj_lib::repo::Repo as _;
34use jj_lib::repo_path::RepoPathBuf;
35use jj_lib::revset::RevsetExpression;
36use jj_lib::revset::RevsetIteratorExt as _;
37use jj_lib::store::Store;
38use jj_lib::tree::Tree;
39use pollster::FutureExt as _;
40use rayon::iter::IntoParallelIterator as _;
41use rayon::prelude::ParallelIterator as _;
42
43use crate::revset::RevsetEvaluationError;
44
/// Represents a file whose content may be transformed by a FileFixer.
///
/// Derives `Hash`/`Eq` so that identical (content, path) pairs occurring in
/// multiple commits can be deduplicated into a single fixer invocation (see
/// `fix_files`).
// TODO: Add the set of changed line/byte ranges, so those can be passed into code formatters via
// flags. This will help avoid introducing unrelated changes when working on code with out of date
// formatting.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct FileToFix {
    /// Unique identifier for the file content.
    pub file_id: FileId,

    /// The path is provided to allow the FileFixer to potentially:
    ///  - Choose different behaviors for different file names, extensions, etc.
    ///  - Update parts of the file's content that should be derived from the
    ///    file's path.
    pub repo_path: RepoPathBuf,
}
60
/// Error fixing files.
#[derive(Debug, thiserror::Error)]
pub enum FixError {
    /// Error while contacting the Backend.
    #[error(transparent)]
    Backend(#[from] BackendError),
    /// Error resolving commit ancestry.
    #[error(transparent)]
    RevsetEvaluation(#[from] RevsetEvaluationError),
    /// Error occurred while reading/writing file content.
    #[error(transparent)]
    IO(#[from] std::io::Error),
    /// Error occurred while processing the file content.
    ///
    /// The only variant without a `#[from]` conversion; constructed explicitly
    /// to wrap an arbitrary error from the content-fixing step.
    #[error(transparent)]
    FixContent(Box<dyn std::error::Error + Send + Sync>),
}
77
/// Fixes a set of files.
///
/// Fixing a file is implementation dependent. For example it may format source
/// code using a code formatter.
pub trait FileFixer {
    /// Fixes a set of files. Stores the resulting file content (for modified
    /// files).
    ///
    /// Returns a map describing the subset of `files_to_fix` that resulted in
    /// changed file content (unchanged files should not be present in the map),
    /// pointing to the new FileId for the file. Map keys borrow from the
    /// `files_to_fix` set, so no inputs are copied.
    ///
    /// Implementations are expected to be deterministic (callers deduplicate
    /// inputs across commits on that assumption — see `fix_files`).
    ///
    /// TODO: Better error handling so we can tell the user what went wrong with
    /// each failed input.
    fn fix_files<'a>(
        &mut self,
        store: &Store,
        files_to_fix: &'a HashSet<FileToFix>,
    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError>;
}
98
/// Aggregate information about the outcome of the file fixer.
#[derive(Debug, Default)]
pub struct FixSummary {
    /// The commits that were rewritten. Maps old commit id to new commit id.
    ///
    /// This also includes commits that were rewritten only because a parent
    /// changed (reparented), not just those whose file content was fixed.
    pub rewrites: HashMap<CommitId, CommitId>,

    /// The number of commits that had files that were passed to the file fixer.
    pub num_checked_commits: i32,
    /// The number of new commits created due to file content changed by the
    /// fixer.
    pub num_fixed_commits: i32,
}
111
/// A [FileFixer] that applies fix_fn to each file, in parallel.
///
/// The implementation is currently based on [rayon].
// TODO: Consider switching to futures, or document the decision not to. We
// don't need threads unless the threads will be doing more than waiting for
// pipes.
pub struct ParallelFileFixer<T> {
    /// Callback invoked once per file-to-fix. Returns `Ok(Some(new_file_id))`
    /// when it changed the content, `Ok(None)` when the file is unchanged.
    fix_fn: T,
}
121
122impl<T> ParallelFileFixer<T>
123where
124    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
125{
126    /// Creates a ParallelFileFixer.
127    pub fn new(fix_fn: T) -> Self {
128        Self { fix_fn }
129    }
130}
131
132impl<T> FileFixer for ParallelFileFixer<T>
133where
134    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
135{
136    /// Applies `fix_fn()` to the inputs and stores the resulting file content.
137    fn fix_files<'a>(
138        &mut self,
139        store: &Store,
140        files_to_fix: &'a HashSet<FileToFix>,
141    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError> {
142        let (updates_tx, updates_rx) = channel();
143        files_to_fix.into_par_iter().try_for_each_init(
144            || updates_tx.clone(),
145            |updates_tx, file_to_fix| -> Result<(), FixError> {
146                let result = (self.fix_fn)(store, file_to_fix)?;
147                match result {
148                    Some(new_file_id) => {
149                        updates_tx.send((file_to_fix, new_file_id)).unwrap();
150                        Ok(())
151                    }
152                    None => Ok(()),
153                }
154            },
155        )?;
156        drop(updates_tx);
157        let mut result = HashMap::new();
158        while let Ok((file_to_fix, new_file_id)) = updates_rx.recv() {
159            result.insert(file_to_fix, new_file_id);
160        }
161        Ok(result)
162    }
163}
164
/// Updates files with formatting fixes or other changes, using the given
/// FileFixer.
///
/// The primary use case is to apply the results of automatic code formatting
/// tools to revisions that may not be properly formatted yet. It can also be
/// used to modify files with other tools like `sed` or `sort`.
///
/// After the FileFixer is done, descendants are also updated, which ensures
/// that the fixes are not lost. This will never result in new conflicts. Files
/// with existing conflicts are updated on all sides of the conflict, which
/// can potentially increase or decrease the number of conflict markers.
///
/// `root_commits` are the roots of the set of commits to fix; all of their
/// descendants are considered too. `matcher` restricts which paths are
/// eligible. If `include_unchanged_files` is true, every matching file in each
/// commit is fixed; otherwise only files changed in a commit (or fixed in one
/// of its ancestors) are fixed. Returns a [FixSummary] of the rewrites
/// performed.
pub fn fix_files(
    root_commits: Vec<CommitId>,
    matcher: &dyn Matcher,
    include_unchanged_files: bool,
    repo_mut: &mut MutableRepo,
    file_fixer: &mut impl FileFixer,
) -> Result<FixSummary, FixError> {
    let mut summary = FixSummary::default();

    // Collect all of the unique `FileToFix`s we're going to use. file_fixer should
    // be deterministic, and should not consider outside information, so it is
    // safe to deduplicate inputs that correspond to multiple files or commits.
    // This is typically more efficient, but it does prevent certain use cases
    // like providing commit IDs as inputs to be inserted into files. We also
    // need to record the mapping between files-to-fix and paths/commits, to
    // efficiently rewrite the commits later.
    //
    // If a path is being fixed in a particular commit, it must also be fixed in all
    // that commit's descendants. We do this as a way of propagating changes,
    // under the assumption that it is more useful than performing a rebase and
    // risking merge conflicts. In the case of code formatters, rebasing wouldn't
    // reliably produce well formatted code anyway. Deduplicating inputs helps
    // to prevent quadratic growth in the number of tool executions required for
    // doing this in long chains of commits with disjoint sets of modified files.
    let commits: Vec<_> = RevsetExpression::commits(root_commits.clone())
        .descendants()
        .evaluate(repo_mut)?
        .iter()
        .commits(repo_mut.store())
        .try_collect()?;
    tracing::debug!(
        ?root_commits,
        ?commits,
        "looking for files to fix in commits:"
    );

    let mut unique_files_to_fix: HashSet<FileToFix> = HashSet::new();
    let mut commit_paths: HashMap<CommitId, HashSet<RepoPathBuf>> = HashMap::new();
    // NOTE: the `rev()` is load-bearing: it makes parents come before children
    // (revset iteration presumably yields descendants first — confirm against
    // the revset engine), so `commit_paths` already holds each parent's fixed
    // paths when its children are visited below.
    for commit in commits.iter().rev() {
        let mut paths: HashSet<RepoPathBuf> = HashSet::new();

        // If --include-unchanged-files, we always fix every matching file in the tree.
        // Otherwise, we fix the matching changed files in this commit, plus any that
        // were fixed in ancestors, so we don't lose those changes. We do this
        // instead of rebasing onto those changes, to avoid merge conflicts.
        let parent_tree = if include_unchanged_files {
            // Diffing against an empty tree makes every matching file in the
            // commit show up as "changed".
            MergedTree::resolved(Tree::empty(repo_mut.store().clone(), RepoPathBuf::root()))
        } else {
            for parent_id in commit.parent_ids() {
                if let Some(parent_paths) = commit_paths.get(parent_id) {
                    paths.extend(parent_paths.iter().cloned());
                }
            }
            commit.parent_tree(repo_mut)?
        };
        // TODO: handle copy tracking
        let mut diff_stream = parent_tree.diff_stream(&commit.tree()?, &matcher);
        // Drain the async diff stream synchronously on this thread via
        // pollster's `block_on`.
        async {
            while let Some(TreeDiffEntry {
                path: repo_path,
                values,
            }) = diff_stream.next().await
            {
                let (_before, after) = values?;
                // Deleted files have no file content to fix, and they have no terms in `after`,
                // so we don't add any files-to-fix for them. Conflicted files produce one
                // file-to-fix for each side of the conflict.
                for term in after.into_iter().flatten() {
                    // We currently only support fixing the content of normal files, so we skip
                    // directories and symlinks, and we ignore the executable bit.
                    if let TreeValue::File {
                        id,
                        executable: _,
                        copy_id: _,
                    } = term
                    {
                        // TODO: Skip the file if its content is larger than some configured size,
                        // preferably without actually reading it yet.
                        let file_to_fix = FileToFix {
                            file_id: id.clone(),
                            repo_path: repo_path.clone(),
                        };
                        unique_files_to_fix.insert(file_to_fix.clone());
                        paths.insert(repo_path.clone());
                    }
                }
            }
            Ok::<(), BackendError>(())
        }
        .block_on()?;

        commit_paths.insert(commit.id().clone(), paths);
    }

    tracing::debug!(
        ?include_unchanged_files,
        ?unique_files_to_fix,
        "invoking file fixer on these files:"
    );

    // Fix all of the chosen inputs.
    let fixed_file_ids = file_fixer.fix_files(repo_mut.store().as_ref(), &unique_files_to_fix)?;
    tracing::debug!(?fixed_file_ids, "file fixer fixed these files:");

    // Substitute the fixed file IDs into all of the affected commits. Currently,
    // fixes cannot delete or rename files, change the executable bit, or modify
    // other parts of the commit like the description.
    repo_mut.transform_descendants(root_commits, |mut rewriter| {
        // TODO: Build the trees in parallel before `transform_descendants()` and only
        // keep the tree IDs in memory, so we can pass them to the rewriter.
        let old_commit_id = rewriter.old_commit().id().clone();
        // Every commit visited here was also visited in the collection loop
        // above, so the entry must exist.
        let repo_paths = commit_paths.get(&old_commit_id).unwrap();
        let old_tree = rewriter.old_commit().tree()?;
        let mut tree_builder = MergedTreeBuilder::new(old_tree.id().clone());
        let mut has_changes = false;
        for repo_path in repo_paths {
            let old_value = old_tree.path_value(repo_path)?;
            // Map each conflict term independently, so all sides of a
            // conflicted file get the fix applied.
            let new_value = old_value.map(|old_term| {
                if let Some(TreeValue::File {
                    id,
                    executable,
                    copy_id,
                }) = old_term
                {
                    let file_to_fix = FileToFix {
                        file_id: id.clone(),
                        repo_path: repo_path.clone(),
                    };
                    if let Some(new_id) = fixed_file_ids.get(&file_to_fix) {
                        return Some(TreeValue::File {
                            id: new_id.clone(),
                            executable: *executable,
                            copy_id: copy_id.clone(),
                        });
                    }
                }
                old_term.clone()
            });
            if new_value != old_value {
                tree_builder.set_or_remove(repo_path.clone(), new_value);
                has_changes = true;
            }
        }
        summary.num_checked_commits += 1;
        if has_changes {
            summary.num_fixed_commits += 1;
            let new_tree = tree_builder.write_tree(rewriter.mut_repo().store())?;
            let builder = rewriter.reparent();
            let new_commit = builder.set_tree_id(new_tree).write()?;
            summary
                .rewrites
                .insert(old_commit_id, new_commit.id().clone());
        } else if rewriter.parents_changed() {
            // No content changes, but a parent was rewritten; reparent so the
            // descendant chain stays consistent.
            let new_commit = rewriter.reparent().write()?;
            summary
                .rewrites
                .insert(old_commit_id, new_commit.id().clone());
        }
        Ok(())
    })?;

    tracing::debug!(?summary);
    Ok(summary)
}