// jj_lib/fix.rs
1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! API for transforming file content, for example to apply formatting, and
16//! propagate those changes across revisions.
17
18use std::collections::HashMap;
19use std::collections::HashSet;
20use std::sync::mpsc::channel;
21
22use futures::StreamExt as _;
23use itertools::Itertools as _;
24use jj_lib::backend::BackendError;
25use jj_lib::backend::CommitId;
26use jj_lib::backend::FileId;
27use jj_lib::backend::TreeValue;
28use jj_lib::matchers::Matcher;
29use jj_lib::merged_tree::MergedTree;
30use jj_lib::merged_tree::MergedTreeBuilder;
31use jj_lib::merged_tree::TreeDiffEntry;
32use jj_lib::repo::MutableRepo;
33use jj_lib::repo::Repo as _;
34use jj_lib::repo_path::RepoPathBuf;
35use jj_lib::revset::RevsetExpression;
36use jj_lib::revset::RevsetIteratorExt as _;
37use jj_lib::store::Store;
38use jj_lib::tree::Tree;
39use rayon::iter::IntoParallelIterator as _;
40use rayon::prelude::ParallelIterator as _;
41
42use crate::revset::RevsetEvaluationError;
43
/// Represents a file whose content may be transformed by a [`FileFixer`].
///
/// Equality and hashing cover both the content id and the path, so the same
/// content appearing at two different paths is fixed separately (the fixer may
/// behave differently per path).
// TODO: Add the set of changed line/byte ranges, so those can be passed into code formatters via
// flags. This will help avoid introducing unrelated changes when working on code with out of date
// formatting.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct FileToFix {
    /// Unique identifier for the file content.
    pub file_id: FileId,

    /// The path is provided to allow the FileFixer to potentially:
    /// - Choose different behaviors for different file names, extensions, etc.
    /// - Update parts of the file's content that should be derived from the
    ///   file's path.
    pub repo_path: RepoPathBuf,
}
59
/// Error fixing files.
#[derive(Debug, thiserror::Error)]
pub enum FixError {
    /// Error while contacting the Backend.
    #[error(transparent)]
    Backend(#[from] BackendError),
    /// Error resolving commit ancestry.
    #[error(transparent)]
    RevsetEvaluation(#[from] RevsetEvaluationError),
    /// Error occurred while reading/writing file content.
    #[error(transparent)]
    IO(#[from] std::io::Error),
    /// Error occurred while processing the file content (e.g. raised by a
    /// [`FileFixer`] implementation). Boxed so arbitrary tool errors can be
    /// carried without adding variants.
    #[error(transparent)]
    FixContent(Box<dyn std::error::Error + Send + Sync>),
}
76
/// Fixes a set of files.
///
/// Fixing a file is implementation dependent. For example it may format source
/// code using a code formatter.
pub trait FileFixer {
    /// Fixes a set of files. Stores the resulting file content (for modified
    /// files).
    ///
    /// `store` provides access to the file content identified by each
    /// [`FileToFix::file_id`], and is where new content is written.
    ///
    /// Returns a map describing the subset of `files_to_fix` that resulted in
    /// changed file content (unchanged files should not be present in the map),
    /// pointing to the new FileId for the file. Borrowing the keys from
    /// `files_to_fix` (lifetime `'a`) avoids cloning the inputs.
    ///
    /// TODO: Better error handling so we can tell the user what went wrong with
    /// each failed input.
    fn fix_files<'a>(
        &mut self,
        store: &Store,
        files_to_fix: &'a HashSet<FileToFix>,
    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError>;
}
97
/// Aggregate information about the outcome of the file fixer.
#[derive(Debug, Default)]
pub struct FixSummary {
    /// The commits that were rewritten. Maps old commit id to new commit id.
    /// Includes commits rewritten only because a parent changed, not just
    /// those whose own file content changed.
    pub rewrites: HashMap<CommitId, CommitId>,

    /// The number of commits that had files that were passed to the file fixer.
    pub num_checked_commits: i32,
    /// The number of new commits created due to file content changed by the
    /// fixer.
    pub num_fixed_commits: i32,
}
110
/// A [FileFixer] that applies fix_fn to each file, in parallel.
///
/// The implementation is currently based on [rayon].
// TODO: Consider switching to futures, or document the decision not to. We
// don't need threads unless the threads will be doing more than waiting for
// pipes.
pub struct ParallelFileFixer<T> {
    // The per-file callback; see the trait bounds on the impl blocks for its
    // required signature.
    fix_fn: T,
}
120
121impl<T> ParallelFileFixer<T>
122where
123 T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
124{
125 /// Creates a ParallelFileFixer.
126 pub fn new(fix_fn: T) -> Self {
127 Self { fix_fn }
128 }
129}
130
131impl<T> FileFixer for ParallelFileFixer<T>
132where
133 T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
134{
135 /// Applies `fix_fn()` to the inputs and stores the resulting file content.
136 fn fix_files<'a>(
137 &mut self,
138 store: &Store,
139 files_to_fix: &'a HashSet<FileToFix>,
140 ) -> Result<HashMap<&'a FileToFix, FileId>, FixError> {
141 let (updates_tx, updates_rx) = channel();
142 files_to_fix.into_par_iter().try_for_each_init(
143 || updates_tx.clone(),
144 |updates_tx, file_to_fix| -> Result<(), FixError> {
145 let result = (self.fix_fn)(store, file_to_fix)?;
146 match result {
147 Some(new_file_id) => {
148 updates_tx.send((file_to_fix, new_file_id)).unwrap();
149 Ok(())
150 }
151 None => Ok(()),
152 }
153 },
154 )?;
155 drop(updates_tx);
156 let mut result = HashMap::new();
157 while let Ok((file_to_fix, new_file_id)) = updates_rx.recv() {
158 result.insert(file_to_fix, new_file_id);
159 }
160 Ok(result)
161 }
162}
163
/// Updates files with formatting fixes or other changes, using the given
/// FileFixer.
///
/// The primary use case is to apply the results of automatic code formatting
/// tools to revisions that may not be properly formatted yet. It can also be
/// used to modify files with other tools like `sed` or `sort`.
///
/// After the FileFixer is done, descendants are also updated, which ensures
/// that the fixes are not lost. This will never result in new conflicts. Files
/// with existing conflicts are updated on all sides of the conflict, which
/// can potentially increase or decrease the number of conflict markers.
///
/// `root_commits` are the starting points; every descendant of them (per the
/// revset evaluation below) is considered. `matcher` restricts which paths are
/// eligible. With `include_unchanged_files`, every matching file in each tree
/// is fixed, not just files changed relative to the parents.
///
/// # Errors
///
/// Returns [`FixError`] on backend/revset/I-O failures or when the fixer
/// itself fails; see [`FixError`] variants.
pub async fn fix_files(
    root_commits: Vec<CommitId>,
    matcher: &dyn Matcher,
    include_unchanged_files: bool,
    repo_mut: &mut MutableRepo,
    file_fixer: &mut impl FileFixer,
) -> Result<FixSummary, FixError> {
    let mut summary = FixSummary::default();

    // Collect all of the unique `FileToFix`s we're going to use. file_fixer should
    // be deterministic, and should not consider outside information, so it is
    // safe to deduplicate inputs that correspond to multiple files or commits.
    // This is typically more efficient, but it does prevent certain use cases
    // like providing commit IDs as inputs to be inserted into files. We also
    // need to record the mapping between files-to-fix and paths/commits, to
    // efficiently rewrite the commits later.
    //
    // If a path is being fixed in a particular commit, it must also be fixed in all
    // that commit's descendants. We do this as a way of propagating changes,
    // under the assumption that it is more useful than performing a rebase and
    // risking merge conflicts. In the case of code formatters, rebasing wouldn't
    // reliably produce well formatted code anyway. Deduplicating inputs helps
    // to prevent quadratic growth in the number of tool executions required for
    // doing this in long chains of commits with disjoint sets of modified files.
    let commits: Vec<_> = RevsetExpression::commits(root_commits.clone())
        .descendants()
        .evaluate(repo_mut)?
        .iter()
        .commits(repo_mut.store())
        .try_collect()?;
    tracing::debug!(
        ?root_commits,
        ?commits,
        "looking for files to fix in commits:"
    );

    let mut unique_files_to_fix: HashSet<FileToFix> = HashSet::new();
    let mut commit_paths: HashMap<CommitId, HashSet<RepoPathBuf>> = HashMap::new();
    // Iterate in reverse so that a commit's parents are visited before the
    // commit itself: the `commit_paths` lookup below relies on parent entries
    // already being present. (Presumably revset iteration yields descendants
    // before ancestors — reversing restores parents-first order.)
    for commit in commits.iter().rev() {
        let mut paths: HashSet<RepoPathBuf> = HashSet::new();

        // If --include-unchanged-files, we always fix every matching file in the tree.
        // Otherwise, we fix the matching changed files in this commit, plus any that
        // were fixed in ancestors, so we don't lose those changes. We do this
        // instead of rebasing onto those changes, to avoid merge conflicts.
        let parent_tree = if include_unchanged_files {
            // Diffing against an empty tree makes every file in the commit
            // appear "changed", so all matching files are fixed.
            MergedTree::resolved(Tree::empty(repo_mut.store().clone(), RepoPathBuf::root()))
        } else {
            // Inherit the paths fixed in each parent so the fixes propagate.
            for parent_id in commit.parent_ids() {
                if let Some(parent_paths) = commit_paths.get(parent_id) {
                    paths.extend(parent_paths.iter().cloned());
                }
            }
            commit.parent_tree(repo_mut)?
        };
        // TODO: handle copy tracking
        let mut diff_stream = parent_tree.diff_stream(&commit.tree()?, &matcher);
        while let Some(TreeDiffEntry {
            path: repo_path,
            values,
        }) = diff_stream.next().await
        {
            let (_before, after) = values?;
            // Deleted files have no file content to fix, and they have no terms in `after`,
            // so we don't add any files-to-fix for them. Conflicted files produce one
            // file-to-fix for each side of the conflict.
            for term in after.into_iter().flatten() {
                // We currently only support fixing the content of normal files, so we skip
                // directories and symlinks, and we ignore the executable bit.
                if let TreeValue::File {
                    id,
                    executable: _,
                    copy_id: _,
                } = term
                {
                    // TODO: Skip the file if its content is larger than some configured size,
                    // preferably without actually reading it yet.
                    let file_to_fix = FileToFix {
                        file_id: id.clone(),
                        repo_path: repo_path.clone(),
                    };
                    unique_files_to_fix.insert(file_to_fix.clone());
                    paths.insert(repo_path.clone());
                }
            }
        }

        commit_paths.insert(commit.id().clone(), paths);
    }

    tracing::debug!(
        ?include_unchanged_files,
        ?unique_files_to_fix,
        "invoking file fixer on these files:"
    );

    // Fix all of the chosen inputs.
    let fixed_file_ids = file_fixer.fix_files(repo_mut.store().as_ref(), &unique_files_to_fix)?;
    tracing::debug!(?fixed_file_ids, "file fixer fixed these files:");

    // Substitute the fixed file IDs into all of the affected commits. Currently,
    // fixes cannot delete or rename files, change the executable bit, or modify
    // other parts of the commit like the description.
    repo_mut.transform_descendants(root_commits, |mut rewriter| {
        // TODO: Build the trees in parallel before `transform_descendants()` and only
        // keep the tree IDs in memory, so we can pass them to the rewriter.
        let old_commit_id = rewriter.old_commit().id().clone();
        // Every rewritten commit was visited in the collection loop above, so
        // this lookup is expected to succeed.
        let repo_paths = commit_paths.get(&old_commit_id).unwrap();
        let old_tree = rewriter.old_commit().tree()?;
        let mut tree_builder = MergedTreeBuilder::new(old_tree.id().clone());
        let mut has_changes = false;
        for repo_path in repo_paths {
            let old_value = old_tree.path_value(repo_path)?;
            // For each term of the (possibly conflicted) value, substitute the
            // fixed content id if the fixer produced one; executable bit and
            // copy id are preserved. Non-file terms are kept unchanged.
            let new_value = old_value.map(|old_term| {
                if let Some(TreeValue::File {
                    id,
                    executable,
                    copy_id,
                }) = old_term
                {
                    let file_to_fix = FileToFix {
                        file_id: id.clone(),
                        repo_path: repo_path.clone(),
                    };
                    if let Some(new_id) = fixed_file_ids.get(&file_to_fix) {
                        return Some(TreeValue::File {
                            id: new_id.clone(),
                            executable: *executable,
                            copy_id: copy_id.clone(),
                        });
                    }
                }
                old_term.clone()
            });
            if new_value != old_value {
                tree_builder.set_or_remove(repo_path.clone(), new_value);
                has_changes = true;
            }
        }
        summary.num_checked_commits += 1;
        if has_changes {
            // Content changed: write a new tree and a new commit on top of it.
            summary.num_fixed_commits += 1;
            let new_tree = tree_builder.write_tree(rewriter.mut_repo().store())?;
            let builder = rewriter.reparent();
            let new_commit = builder.set_tree_id(new_tree).write()?;
            summary
                .rewrites
                .insert(old_commit_id, new_commit.id().clone());
        } else if rewriter.parents_changed() {
            // No content change here, but an ancestor was rewritten, so this
            // commit still needs to be reparented onto the new ancestor.
            let new_commit = rewriter.reparent().write()?;
            summary
                .rewrites
                .insert(old_commit_id, new_commit.id().clone());
        }
        Ok(())
    })?;

    tracing::debug!(?summary);
    Ok(summary)
}