jj_lib/fix.rs
1// Copyright 2024 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! API for transforming file content, for example to apply formatting, and
16//! propagate those changes across revisions.
17
18use std::collections::HashMap;
19use std::collections::HashSet;
20use std::sync::mpsc::channel;
21
22use futures::StreamExt as _;
23use futures::TryStreamExt as _;
24use futures::future::try_join_all;
25use indexmap::IndexSet;
26use jj_lib::backend::BackendError;
27use jj_lib::backend::CommitId;
28use jj_lib::backend::FileId;
29use jj_lib::backend::TreeValue;
30use jj_lib::commit::Commit;
31use jj_lib::diff::ContentDiff;
32use jj_lib::diff::DiffHunkKind;
33use jj_lib::matchers::Matcher;
34use jj_lib::merged_tree::TreeDiffEntry;
35use jj_lib::merged_tree_builder::MergedTreeBuilder;
36use jj_lib::repo::MutableRepo;
37use jj_lib::repo::Repo as _;
38use jj_lib::repo_path::RepoPathBuf;
39use jj_lib::revset::RevsetExpression;
40use jj_lib::rewrite::merge_commit_trees;
41use jj_lib::store::Store;
42use rayon::iter::IntoParallelIterator as _;
43use rayon::prelude::ParallelIterator as _;
44
45use crate::revset::RevsetEvaluationError;
46use crate::revset::RevsetStreamExt as _;
47
48/// Represents a file whose content may be transformed by a FileFixer.
49#[derive(Debug, PartialEq, Eq, Hash, Clone)]
50pub struct FileToFix {
51 /// Unique identifier for the file content.
52 pub file_id: FileId,
53
54 /// The base FileId for the file content. We will use this FileId to
55 /// create the diff between the base commit's file content and the current
56 /// commit's file content before the fix.
57 pub base_file_id: Option<FileId>,
58
59 /// The path is provided to allow the FileFixer to potentially:
60 /// - Choose different behaviors for different file names, extensions, etc.
61 /// - Update parts of the file's content that should be derived from the
62 /// file's path.
63 pub repo_path: RepoPathBuf,
64}
65
66/// Error fixing files.
67#[derive(Debug, thiserror::Error)]
68pub enum FixError {
69 /// Error while contacting the Backend.
70 #[error(transparent)]
71 Backend(#[from] BackendError),
72 /// Error resolving commit ancestry.
73 #[error(transparent)]
74 RevsetEvaluation(#[from] RevsetEvaluationError),
75 /// Error occurred while reading/writing file content.
76 #[error(transparent)]
77 Io(#[from] std::io::Error),
78 /// Error occurred while processing the file content.
79 #[error(transparent)]
80 FixContent(Box<dyn std::error::Error + Send + Sync>),
81}
82
83/// Fixes a set of files.
84///
85/// Fixing a file is implementation dependent. For example it may format source
86/// code using a code formatter.
87pub trait FileFixer {
88 /// Fixes a set of files. Stores the resulting file content (for modified
89 /// files).
90 ///
91 /// Returns a map describing the subset of `files_to_fix` that resulted in
92 /// changed file content (unchanged files should not be present in the map),
93 /// pointing to the new FileId for the file.
94 ///
95 /// TODO: Better error handling so we can tell the user what went wrong with
96 /// each failed input.
97 fn fix_files<'a>(
98 &mut self,
99 store: &Store,
100 files_to_fix: &'a HashSet<FileToFix>,
101 ) -> Result<HashMap<&'a FileToFix, FileId>, FixError>;
102}
103
104/// Aggregate information about the outcome of the file fixer.
105#[derive(Debug, Default)]
106pub struct FixSummary {
107 /// The commits that were rewritten. Maps old commit id to new commit id.
108 pub rewrites: HashMap<CommitId, CommitId>,
109
110 /// The number of commits that had files that were passed to the file fixer.
111 pub num_checked_commits: i32,
112 /// The number of new commits created due to file content changed by the
113 /// fixer.
114 pub num_fixed_commits: i32,
115}
116
117/// A [FileFixer] that applies fix_fn to each file, in parallel.
118///
119/// The implementation is currently based on [rayon].
120// TODO: Consider switching to futures, or document the decision not to. We
121// don't need threads unless the threads will be doing more than waiting for
122// pipes.
123pub struct ParallelFileFixer<T> {
124 fix_fn: T,
125}
126
127impl<T> ParallelFileFixer<T>
128where
129 T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
130{
131 /// Creates a ParallelFileFixer.
132 pub fn new(fix_fn: T) -> Self {
133 Self { fix_fn }
134 }
135}
136
137impl<T> FileFixer for ParallelFileFixer<T>
138where
139 T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
140{
141 /// Applies `fix_fn()` to the inputs and stores the resulting file content.
142 fn fix_files<'a>(
143 &mut self,
144 store: &Store,
145 files_to_fix: &'a HashSet<FileToFix>,
146 ) -> Result<HashMap<&'a FileToFix, FileId>, FixError> {
147 let (updates_tx, updates_rx) = channel();
148 files_to_fix.into_par_iter().try_for_each_init(
149 || updates_tx.clone(),
150 |updates_tx, file_to_fix| -> Result<(), FixError> {
151 let result = (self.fix_fn)(store, file_to_fix)?;
152 match result {
153 Some(new_file_id) => {
154 updates_tx.send((file_to_fix, new_file_id)).unwrap();
155 Ok(())
156 }
157 None => Ok(()),
158 }
159 },
160 )?;
161 drop(updates_tx);
162 let mut result = HashMap::new();
163 while let Ok((file_to_fix, new_file_id)) = updates_rx.recv() {
164 result.insert(file_to_fix, new_file_id);
165 }
166 Ok(result)
167 }
168}
169
170/// Updates files with formatting fixes or other changes, using the given
171/// FileFixer.
172///
173/// The primary use case is to apply the results of automatic code formatting
174/// tools to revisions that may not be properly formatted yet. It can also be
175/// used to modify files with other tools like `sed` or `sort`.
176///
177/// After the FileFixer is done, descendants are also updated, which ensures
178/// that the fixes are not lost. This will never result in new conflicts. Files
179/// with existing conflicts are updated on all sides of the conflict, which
180/// can potentially increase or decrease the number of conflict markers.
181pub async fn fix_files(
182 root_commits: Vec<CommitId>,
183 matcher: &dyn Matcher,
184 include_unchanged_files: bool,
185 repo_mut: &mut MutableRepo,
186 file_fixer: &mut impl FileFixer,
187) -> Result<FixSummary, FixError> {
188 let mut summary = FixSummary::default();
189
190 // Collect all of the unique `FileToFix`s we're going to use. file_fixer should
191 // be deterministic, and should not consider outside information, so it is
192 // safe to deduplicate inputs that correspond to multiple files or commits.
193 // This is typically more efficient, but it does prevent certain use cases
194 // like providing commit IDs as inputs to be inserted into files. We also
195 // need to record the mapping between files-to-fix and paths/commits, to
196 // efficiently rewrite the commits later.
197 //
198 // If a path is being fixed in a particular commit, it must also be fixed in all
199 // that commit's descendants. We do this as a way of propagating changes,
200 // under the assumption that it is more useful than performing a rebase and
201 // risking merge conflicts. In the case of code formatters, rebasing wouldn't
202 // reliably produce well formatted code anyway. Deduplicating inputs helps
203 // to prevent quadratic growth in the number of tool executions required for
204 // doing this in long chains of commits with disjoint sets of modified files.
205 let commits: Vec<_> = RevsetExpression::commits(root_commits.clone())
206 .descendants()
207 .evaluate(repo_mut)?
208 .stream()
209 .commits(repo_mut.store())
210 .try_collect()
211 .await?;
212 tracing::debug!(
213 ?root_commits,
214 ?commits,
215 "looking for files to fix in commits:"
216 );
217
218 // Determine the base commit(s) for each commit.
219 let base_commit_map = get_base_commit_map(&commits).await?;
220
221 // Maps repo paths in a commit to the base FileId for that path.
222 // Even if a file is conflicted in the current commit (having multiple
223 // FileIds at the same path), all sides share the same base FileId (derived
224 // from the first side of the base), so we don't need the current FileId in
225 // the key.
226 let mut base_files: HashMap<(CommitId, RepoPathBuf), FileId> = HashMap::new();
227
228 let mut unique_files_to_fix: HashSet<FileToFix> = HashSet::new();
229 let mut commit_paths: HashMap<CommitId, HashSet<RepoPathBuf>> = HashMap::new();
230 for commit in commits.iter().rev() {
231 let mut paths: HashSet<RepoPathBuf> = HashSet::new();
232
233 // Compute the base tree for the current commit.
234 let mut base_commits = Vec::new();
235 let base_commit_ids = base_commit_map.get(commit.id()).unwrap();
236 for base_commit_id in base_commit_ids {
237 if let Some(base_paths) = commit_paths.get(base_commit_id) {
238 paths.extend(base_paths.iter().cloned());
239 }
240 let base_commit = repo_mut.store().get_commit_async(base_commit_id).await?;
241 base_commits.push(base_commit);
242 }
243 let base_tree = merge_commit_trees(repo_mut, &base_commits).await?;
244
245 // If --include-unchanged-files, we always fix every matching file in the tree.
246 // Otherwise, we fix the matching changed files in this commit, plus any that
247 // were fixed in ancestors, so we don't lose those changes. We do this
248 // instead of rebasing onto those changes, to avoid merge conflicts.
249 let diff_base_tree = if include_unchanged_files {
250 &repo_mut.store().empty_merged_tree()
251 } else {
252 &base_tree
253 };
254
255 // TODO: handle copy tracking
256 let mut diff_stream = diff_base_tree.diff_stream(&commit.tree(), &matcher);
257 while let Some(TreeDiffEntry {
258 path: repo_path,
259 values,
260 }) = diff_stream.next().await
261 {
262 let values = values?;
263 if values.after.is_absent() {
264 continue;
265 }
266 let before = if include_unchanged_files {
267 base_tree.path_value(&repo_path).await?.into_iter().next()
268 } else {
269 values.before.into_iter().next()
270 };
271
272 // Deleted files have no file content to fix, and they have no terms in `after`,
273 // so we don't add any files-to-fix for them. For conflicted files in the base
274 // commit(s), we diff against the first side of the conflict. For conflicted
275 // files in the current commit, we add all sides of the conflict to
276 // the files-to-fix.
277 let before_file_id = if let Some(Some(TreeValue::File {
278 id: before_id,
279 executable: _,
280 copy_id: _,
281 })) = before
282 {
283 base_files.insert((commit.id().clone(), repo_path.clone()), before_id.clone());
284 Some(before_id.clone())
285 } else {
286 None
287 };
288
289 for after_term in values.after {
290 // We currently only support fixing the content of normal files, so we skip
291 // directories and symlinks, and we ignore the executable bit.
292 if let Some(TreeValue::File {
293 id,
294 executable: _,
295 copy_id: _,
296 }) = after_term
297 {
298 // TODO: Skip the file if its content is larger than some configured size,
299 // preferably without actually reading it yet.
300 let file_to_fix = FileToFix {
301 file_id: id.clone(),
302 base_file_id: before_file_id.clone(),
303 repo_path: repo_path.clone(),
304 };
305 unique_files_to_fix.insert(file_to_fix.clone());
306 paths.insert(repo_path.clone());
307 }
308 }
309 }
310 commit_paths.insert(commit.id().clone(), paths);
311 }
312
313 tracing::debug!(
314 ?include_unchanged_files,
315 ?unique_files_to_fix,
316 "invoking file fixer on these files:"
317 );
318
319 // Fix all of the chosen inputs.
320 let fixed_file_ids = file_fixer.fix_files(repo_mut.store().as_ref(), &unique_files_to_fix)?;
321 tracing::debug!(?fixed_file_ids, "file fixer fixed these files:");
322
323 // Substitute the fixed file IDs into all of the affected commits. Currently,
324 // fixes cannot delete or rename files, change the executable bit, or modify
325 // other parts of the commit like the description.
326 repo_mut
327 .transform_descendants(root_commits, async |rewriter| {
328 // TODO: Build the trees in parallel before `transform_descendants()` and only
329 // keep the tree IDs in memory, so we can pass them to the rewriter.
330 let old_commit_id = rewriter.old_commit().id().clone();
331 let repo_paths = commit_paths.get(&old_commit_id).unwrap();
332 let old_tree = rewriter.old_commit().tree();
333 let mut tree_builder = MergedTreeBuilder::new(old_tree.clone());
334 let mut has_changes = false;
335 for repo_path in repo_paths {
336 let old_value = old_tree.path_value(repo_path).await?;
337 let base_file_id = base_files.get(&(old_commit_id.clone(), repo_path.clone()));
338 let new_value = old_value.map(|old_term| {
339 if let Some(TreeValue::File {
340 id,
341 executable,
342 copy_id,
343 }) = old_term
344 {
345 let file_to_fix = FileToFix {
346 file_id: id.clone(),
347 base_file_id: base_file_id.cloned(),
348 repo_path: repo_path.clone(),
349 };
350 if let Some(new_id) = fixed_file_ids.get(&file_to_fix) {
351 return Some(TreeValue::File {
352 id: new_id.clone(),
353 executable: *executable,
354 copy_id: copy_id.clone(),
355 });
356 }
357 }
358 old_term.clone()
359 });
360 if new_value != old_value {
361 tree_builder.set_or_remove(repo_path.clone(), new_value);
362 has_changes = true;
363 }
364 }
365 summary.num_checked_commits += 1;
366 if has_changes {
367 summary.num_fixed_commits += 1;
368 let new_tree = tree_builder.write_tree().await?;
369 let builder = rewriter.reparent();
370 let new_commit = builder.set_tree(new_tree).write().await?;
371 summary
372 .rewrites
373 .insert(old_commit_id, new_commit.id().clone());
374 } else if rewriter.parents_changed() {
375 let new_commit = rewriter.reparent().write().await?;
376 summary
377 .rewrites
378 .insert(old_commit_id, new_commit.id().clone());
379 }
380 Ok(())
381 })
382 .await?;
383
384 tracing::debug!(?summary);
385 Ok(summary)
386}
387
388/// Representation of different ranges formatters can use to emit diff ranges.
389#[derive(Debug, PartialEq, Eq)]
390pub enum RegionsToFormat {
391 /// Line ranges (1-based, inclusive [first, last]).
392 LineRanges(Vec<LineRange>),
393}
394
395/// A formattable range of lines or bytes.
396#[derive(Debug, PartialEq, Eq)]
397pub struct FormatRange {
398 /// The first (inclusive) of the range.
399 pub first: usize,
400 /// The last (inclusive) of the range.
401 pub last: usize,
402}
403
404impl FormatRange {
405 /// Creates a new `FormatRange`.
406 pub fn new(first: usize, last: usize) -> Self {
407 Self { first, last }
408 }
409}
410
411/// A line range (1-based, inclusive [first, last]).
412pub type LineRange = FormatRange;
413
414/// Computes the 1-based line ranges in `current` that are different from
415/// `base`. The ranges produced can be empty.
416pub fn compute_changed_ranges(base: &[u8], current: &[u8]) -> RegionsToFormat {
417 let mut ranges: Vec<LineRange> = Vec::new();
418 if current.is_empty() {
419 return RegionsToFormat::LineRanges(ranges);
420 }
421
422 let diff = ContentDiff::by_line([base, current]);
423 let mut current_line = 1;
424 for hunk in diff.hunks() {
425 let line_count = compute_file_line_count(hunk.contents[1]);
426 match hunk.kind {
427 DiffHunkKind::Matching => {}
428 DiffHunkKind::Different => {
429 if line_count > 0 {
430 // We want the diff ranges to be 1-based and inclusive [first, last] as this
431 // is what most formatters expect.
432 ranges.push(LineRange {
433 first: current_line,
434 last: current_line + line_count - 1,
435 });
436 }
437 }
438 }
439 current_line += line_count;
440 }
441
442 RegionsToFormat::LineRanges(ranges)
443}
444
445/// Computes the number of lines in a byte slice (i.e. a file).
446pub fn compute_file_line_count(text: &[u8]) -> usize {
447 let line_count = text.iter().filter(|&&b| b == b'\n').count();
448 let extra = if !text.is_empty() && !text.ends_with(b"\n") {
449 1
450 } else {
451 0
452 };
453 line_count + extra
454}
455
456/// Given a vector of commits, determine the base commit(s) for each of the
457/// commits in the vector.
458///
459/// Notes:
460/// - `commits` must be sorted in reverse topological order (children before
461/// parents).
462/// - The returned base commits are the closest ancestors for each commit that
463/// are not in `commits`. They may include ancestors of other base commits.
464///
465/// The current commit will diff against the base commit(s) to determine the
466/// modified files that need to be `jj fix`ed. We also use these base commits to
467/// compute modified lines by diffing the file content in the current commit
468/// against the file content in the base commit(s).
469///
470/// This is public only for testing purposes.
471pub async fn get_base_commit_map(
472 commits: &[Commit],
473) -> Result<HashMap<CommitId, IndexSet<CommitId>>, FixError> {
474 let commit_ids: HashSet<&CommitId> = commits.iter().map(|c| c.id()).collect();
475 let parents_lists = try_join_all(commits.iter().map(|c| c.parents())).await?;
476 let base_commit_ids: HashSet<CommitId> = parents_lists
477 .into_iter()
478 .flatten()
479 .filter(|parent| !commit_ids.contains(parent.id()))
480 .map(|base_commit| base_commit.id().clone())
481 .collect();
482
483 // Build a map of each commit to its "base commits" (closest ancestors not in
484 // `commits`).
485 //
486 // We process commits in topological order (parents before children) so that
487 // we can propagate the base commits from parents to children. Note that the
488 // `commits` vector is in reverse topological order, so we iterate in reverse.
489 let mut base_commit_map: HashMap<CommitId, IndexSet<CommitId>> = HashMap::new();
490 for commit in commits.iter().rev() {
491 let mut parent_commit_ids: IndexSet<CommitId> = IndexSet::new();
492
493 for parent_id in commit.parent_ids() {
494 if let Some(parent_bases) = base_commit_map.get(parent_id) {
495 parent_commit_ids.extend(parent_bases.iter().cloned());
496 }
497 if base_commit_ids.contains(parent_id) {
498 parent_commit_ids.insert(parent_id.clone());
499 }
500 }
501 base_commit_map.insert(commit.id().clone(), parent_commit_ids);
502 }
503
504 Ok(base_commit_map)
505}
506
507#[cfg(test)]
508mod tests {
509 use super::*;
510
511 #[test]
512 fn test_compute_file_line_count() {
513 assert_eq!(compute_file_line_count(b""), 0);
514 assert_eq!(compute_file_line_count(b"a"), 1);
515 assert_eq!(compute_file_line_count(b"a\n"), 1);
516 assert_eq!(compute_file_line_count(b"a\nb"), 2);
517 assert_eq!(compute_file_line_count(b"a\nb\n"), 2);
518 }
519}