jj-lib 0.41.0

Library for Jujutsu - an experimental version control system
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
// Copyright 2024 The Jujutsu Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! API for transforming file content, for example to apply formatting, and
//! propagate those changes across revisions.

use std::collections::HashMap;
use std::collections::HashSet;
use std::sync::mpsc::channel;

use futures::StreamExt as _;
use futures::TryStreamExt as _;
use futures::future::try_join_all;
use indexmap::IndexSet;
use jj_lib::backend::BackendError;
use jj_lib::backend::CommitId;
use jj_lib::backend::FileId;
use jj_lib::backend::TreeValue;
use jj_lib::commit::Commit;
use jj_lib::diff::ContentDiff;
use jj_lib::diff::DiffHunkKind;
use jj_lib::matchers::Matcher;
use jj_lib::merged_tree::TreeDiffEntry;
use jj_lib::merged_tree_builder::MergedTreeBuilder;
use jj_lib::repo::MutableRepo;
use jj_lib::repo::Repo as _;
use jj_lib::repo_path::RepoPathBuf;
use jj_lib::revset::RevsetExpression;
use jj_lib::rewrite::merge_commit_trees;
use jj_lib::store::Store;
use rayon::iter::IntoParallelIterator as _;
use rayon::prelude::ParallelIterator as _;

use crate::revset::RevsetEvaluationError;
use crate::revset::RevsetStreamExt as _;

/// Represents a file whose content may be transformed by a FileFixer.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct FileToFix {
    /// Unique identifier for the file content.
    pub file_id: FileId,

    /// The base FileId for the file content. We will use this FileId to
    /// create the diff between the base commit's file content and the current
    /// commit's file content before the fix.
    pub base_file_id: Option<FileId>,

    /// The path is provided to allow the FileFixer to potentially:
    ///  - Choose different behaviors for different file names, extensions, etc.
    ///  - Update parts of the file's content that should be derived from the
    ///    file's path.
    pub repo_path: RepoPathBuf,
}

/// Error fixing files.
#[derive(Debug, thiserror::Error)]
pub enum FixError {
    /// Error while contacting the Backend.
    #[error(transparent)]
    Backend(#[from] BackendError),
    /// Error resolving commit ancestry.
    #[error(transparent)]
    RevsetEvaluation(#[from] RevsetEvaluationError),
    /// Error occurred while reading/writing file content.
    #[error(transparent)]
    Io(#[from] std::io::Error),
    /// Error occurred while processing the file content.
    #[error(transparent)]
    FixContent(Box<dyn std::error::Error + Send + Sync>),
}

/// Fixes a set of files.
///
/// Fixing a file is implementation dependent. For example it may format source
/// code using a code formatter.
pub trait FileFixer {
    /// Fixes a set of files. Stores the resulting file content (for modified
    /// files).
    ///
    /// Returns a map describing the subset of `files_to_fix` that resulted in
    /// changed file content (unchanged files should not be present in the map),
    /// pointing to the new FileId for the file.
    ///
    /// TODO: Better error handling so we can tell the user what went wrong with
    /// each failed input.
    fn fix_files<'a>(
        &mut self,
        store: &Store,
        files_to_fix: &'a HashSet<FileToFix>,
    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError>;
}

/// Aggregate information about the outcome of the file fixer.
#[derive(Debug, Default)]
pub struct FixSummary {
    /// The commits that were rewritten. Maps old commit id to new commit id.
    pub rewrites: HashMap<CommitId, CommitId>,

    /// The number of commits that had files that were passed to the file fixer.
    pub num_checked_commits: i32,
    /// The number of new commits created due to file content changed by the
    /// fixer.
    pub num_fixed_commits: i32,
}

/// A [FileFixer] that applies fix_fn to each file, in parallel.
///
/// The implementation is currently based on [rayon].
// TODO: Consider switching to futures, or document the decision not to. We
// don't need threads unless the threads will be doing more than waiting for
// pipes.
pub struct ParallelFileFixer<T> {
    fix_fn: T,
}

impl<T> ParallelFileFixer<T>
where
    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
{
    /// Creates a ParallelFileFixer.
    pub fn new(fix_fn: T) -> Self {
        Self { fix_fn }
    }
}

impl<T> FileFixer for ParallelFileFixer<T>
where
    T: Fn(&Store, &FileToFix) -> Result<Option<FileId>, FixError> + Sync + Send,
{
    /// Applies `fix_fn()` to the inputs and stores the resulting file content.
    fn fix_files<'a>(
        &mut self,
        store: &Store,
        files_to_fix: &'a HashSet<FileToFix>,
    ) -> Result<HashMap<&'a FileToFix, FileId>, FixError> {
        let (updates_tx, updates_rx) = channel();
        files_to_fix.into_par_iter().try_for_each_init(
            || updates_tx.clone(),
            |updates_tx, file_to_fix| -> Result<(), FixError> {
                let result = (self.fix_fn)(store, file_to_fix)?;
                match result {
                    Some(new_file_id) => {
                        updates_tx.send((file_to_fix, new_file_id)).unwrap();
                        Ok(())
                    }
                    None => Ok(()),
                }
            },
        )?;
        drop(updates_tx);
        let mut result = HashMap::new();
        while let Ok((file_to_fix, new_file_id)) = updates_rx.recv() {
            result.insert(file_to_fix, new_file_id);
        }
        Ok(result)
    }
}

/// Updates files with formatting fixes or other changes, using the given
/// FileFixer.
///
/// The primary use case is to apply the results of automatic code formatting
/// tools to revisions that may not be properly formatted yet. It can also be
/// used to modify files with other tools like `sed` or `sort`.
///
/// After the FileFixer is done, descendants are also updated, which ensures
/// that the fixes are not lost. This will never result in new conflicts. Files
/// with existing conflicts are updated on all sides of the conflict, which
/// can potentially increase or decrease the number of conflict markers.
pub async fn fix_files(
    root_commits: Vec<CommitId>,
    matcher: &dyn Matcher,
    include_unchanged_files: bool,
    repo_mut: &mut MutableRepo,
    file_fixer: &mut impl FileFixer,
) -> Result<FixSummary, FixError> {
    let mut summary = FixSummary::default();

    // Collect all of the unique `FileToFix`s we're going to use. file_fixer should
    // be deterministic, and should not consider outside information, so it is
    // safe to deduplicate inputs that correspond to multiple files or commits.
    // This is typically more efficient, but it does prevent certain use cases
    // like providing commit IDs as inputs to be inserted into files. We also
    // need to record the mapping between files-to-fix and paths/commits, to
    // efficiently rewrite the commits later.
    //
    // If a path is being fixed in a particular commit, it must also be fixed in all
    // that commit's descendants. We do this as a way of propagating changes,
    // under the assumption that it is more useful than performing a rebase and
    // risking merge conflicts. In the case of code formatters, rebasing wouldn't
    // reliably produce well formatted code anyway. Deduplicating inputs helps
    // to prevent quadratic growth in the number of tool executions required for
    // doing this in long chains of commits with disjoint sets of modified files.
    let commits: Vec<_> = RevsetExpression::commits(root_commits.clone())
        .descendants()
        .evaluate(repo_mut)?
        .stream()
        .commits(repo_mut.store())
        .try_collect()
        .await?;
    tracing::debug!(
        ?root_commits,
        ?commits,
        "looking for files to fix in commits:"
    );

    // Determine the base commit(s) for each commit.
    let base_commit_map = get_base_commit_map(&commits).await?;

    // Maps repo paths in a commit to the base FileId for that path.
    // Even if a file is conflicted in the current commit (having multiple
    // FileIds at the same path), all sides share the same base FileId (derived
    // from the first side of the base), so we don't need the current FileId in
    // the key.
    let mut base_files: HashMap<(CommitId, RepoPathBuf), FileId> = HashMap::new();

    let mut unique_files_to_fix: HashSet<FileToFix> = HashSet::new();
    let mut commit_paths: HashMap<CommitId, HashSet<RepoPathBuf>> = HashMap::new();
    for commit in commits.iter().rev() {
        let mut paths: HashSet<RepoPathBuf> = HashSet::new();

        // Compute the base tree for the current commit.
        let mut base_commits = Vec::new();
        let base_commit_ids = base_commit_map.get(commit.id()).unwrap();
        for base_commit_id in base_commit_ids {
            if let Some(base_paths) = commit_paths.get(base_commit_id) {
                paths.extend(base_paths.iter().cloned());
            }
            let base_commit = repo_mut.store().get_commit_async(base_commit_id).await?;
            base_commits.push(base_commit);
        }
        let base_tree = merge_commit_trees(repo_mut, &base_commits).await?;

        // If --include-unchanged-files, we always fix every matching file in the tree.
        // Otherwise, we fix the matching changed files in this commit, plus any that
        // were fixed in ancestors, so we don't lose those changes. We do this
        // instead of rebasing onto those changes, to avoid merge conflicts.
        let diff_base_tree = if include_unchanged_files {
            &repo_mut.store().empty_merged_tree()
        } else {
            &base_tree
        };

        // TODO: handle copy tracking
        let mut diff_stream = diff_base_tree.diff_stream(&commit.tree(), &matcher);
        while let Some(TreeDiffEntry {
            path: repo_path,
            values,
        }) = diff_stream.next().await
        {
            let values = values?;
            if values.after.is_absent() {
                continue;
            }
            let before = if include_unchanged_files {
                base_tree.path_value(&repo_path).await?.into_iter().next()
            } else {
                values.before.into_iter().next()
            };

            // Deleted files have no file content to fix, and they have no terms in `after`,
            // so we don't add any files-to-fix for them. For conflicted files in the base
            // commit(s), we diff against the first side of the conflict. For conflicted
            // files in the current commit, we add all sides of the conflict to
            // the files-to-fix.
            let before_file_id = if let Some(Some(TreeValue::File {
                id: before_id,
                executable: _,
                copy_id: _,
            })) = before
            {
                base_files.insert((commit.id().clone(), repo_path.clone()), before_id.clone());
                Some(before_id.clone())
            } else {
                None
            };

            for after_term in values.after {
                // We currently only support fixing the content of normal files, so we skip
                // directories and symlinks, and we ignore the executable bit.
                if let Some(TreeValue::File {
                    id,
                    executable: _,
                    copy_id: _,
                }) = after_term
                {
                    // TODO: Skip the file if its content is larger than some configured size,
                    // preferably without actually reading it yet.
                    let file_to_fix = FileToFix {
                        file_id: id.clone(),
                        base_file_id: before_file_id.clone(),
                        repo_path: repo_path.clone(),
                    };
                    unique_files_to_fix.insert(file_to_fix.clone());
                    paths.insert(repo_path.clone());
                }
            }
        }
        commit_paths.insert(commit.id().clone(), paths);
    }

    tracing::debug!(
        ?include_unchanged_files,
        ?unique_files_to_fix,
        "invoking file fixer on these files:"
    );

    // Fix all of the chosen inputs.
    let fixed_file_ids = file_fixer.fix_files(repo_mut.store().as_ref(), &unique_files_to_fix)?;
    tracing::debug!(?fixed_file_ids, "file fixer fixed these files:");

    // Substitute the fixed file IDs into all of the affected commits. Currently,
    // fixes cannot delete or rename files, change the executable bit, or modify
    // other parts of the commit like the description.
    repo_mut
        .transform_descendants(root_commits, async |rewriter| {
            // TODO: Build the trees in parallel before `transform_descendants()` and only
            // keep the tree IDs in memory, so we can pass them to the rewriter.
            let old_commit_id = rewriter.old_commit().id().clone();
            let repo_paths = commit_paths.get(&old_commit_id).unwrap();
            let old_tree = rewriter.old_commit().tree();
            let mut tree_builder = MergedTreeBuilder::new(old_tree.clone());
            let mut has_changes = false;
            for repo_path in repo_paths {
                let old_value = old_tree.path_value(repo_path).await?;
                let base_file_id = base_files.get(&(old_commit_id.clone(), repo_path.clone()));
                let new_value = old_value.map(|old_term| {
                    if let Some(TreeValue::File {
                        id,
                        executable,
                        copy_id,
                    }) = old_term
                    {
                        let file_to_fix = FileToFix {
                            file_id: id.clone(),
                            base_file_id: base_file_id.cloned(),
                            repo_path: repo_path.clone(),
                        };
                        if let Some(new_id) = fixed_file_ids.get(&file_to_fix) {
                            return Some(TreeValue::File {
                                id: new_id.clone(),
                                executable: *executable,
                                copy_id: copy_id.clone(),
                            });
                        }
                    }
                    old_term.clone()
                });
                if new_value != old_value {
                    tree_builder.set_or_remove(repo_path.clone(), new_value);
                    has_changes = true;
                }
            }
            summary.num_checked_commits += 1;
            if has_changes {
                summary.num_fixed_commits += 1;
                let new_tree = tree_builder.write_tree().await?;
                let builder = rewriter.reparent();
                let new_commit = builder.set_tree(new_tree).write().await?;
                summary
                    .rewrites
                    .insert(old_commit_id, new_commit.id().clone());
            } else if rewriter.parents_changed() {
                let new_commit = rewriter.reparent().write().await?;
                summary
                    .rewrites
                    .insert(old_commit_id, new_commit.id().clone());
            }
            Ok(())
        })
        .await?;

    tracing::debug!(?summary);
    Ok(summary)
}

/// Representation of different ranges formatters can use to emit diff ranges.
#[derive(Debug, PartialEq, Eq)]
pub enum RegionsToFormat {
    /// Line ranges (1-based, inclusive [first, last]).
    LineRanges(Vec<LineRange>),
}

/// A formattable range of lines or bytes.
#[derive(Debug, PartialEq, Eq)]
pub struct FormatRange {
    /// The first (inclusive) of the range.
    pub first: usize,
    /// The last (inclusive) of the range.
    pub last: usize,
}

impl FormatRange {
    /// Creates a new `FormatRange`.
    pub fn new(first: usize, last: usize) -> Self {
        Self { first, last }
    }
}

/// A line range (1-based, inclusive [first, last]).
pub type LineRange = FormatRange;

/// Computes the 1-based line ranges in `current` that are different from
/// `base`. The ranges produced can be empty.
pub fn compute_changed_ranges(base: &[u8], current: &[u8]) -> RegionsToFormat {
    let mut ranges: Vec<LineRange> = Vec::new();
    if current.is_empty() {
        return RegionsToFormat::LineRanges(ranges);
    }

    let diff = ContentDiff::by_line([base, current]);
    let mut current_line = 1;
    for hunk in diff.hunks() {
        let line_count = compute_file_line_count(hunk.contents[1]);
        match hunk.kind {
            DiffHunkKind::Matching => {}
            DiffHunkKind::Different => {
                if line_count > 0 {
                    // We want the diff ranges to be 1-based and inclusive [first, last] as this
                    // is what most formatters expect.
                    ranges.push(LineRange {
                        first: current_line,
                        last: current_line + line_count - 1,
                    });
                }
            }
        }
        current_line += line_count;
    }

    RegionsToFormat::LineRanges(ranges)
}

/// Computes the number of lines in a byte slice (i.e. a file).
pub fn compute_file_line_count(text: &[u8]) -> usize {
    let line_count = text.iter().filter(|&&b| b == b'\n').count();
    let extra = if !text.is_empty() && !text.ends_with(b"\n") {
        1
    } else {
        0
    };
    line_count + extra
}

/// Given a vector of commits, determine the base commit(s) for each of the
/// commits in the vector.
///
/// Notes:
/// - `commits` must be sorted in reverse topological order (children before
///   parents).
/// - The returned base commits are the closest ancestors for each commit that
///   are not in `commits`. They may include ancestors of other base commits.
///
/// The current commit will diff against the base commit(s) to determine the
/// modified files that need to be `jj fix`ed. We also use these base commits to
/// compute modified lines by diffing the file content in the current commit
/// against the file content in the base commit(s).
///
/// This is public only for testing purposes.
pub async fn get_base_commit_map(
    commits: &[Commit],
) -> Result<HashMap<CommitId, IndexSet<CommitId>>, FixError> {
    let commit_ids: HashSet<&CommitId> = commits.iter().map(|c| c.id()).collect();
    let parents_lists = try_join_all(commits.iter().map(|c| c.parents())).await?;
    let base_commit_ids: HashSet<CommitId> = parents_lists
        .into_iter()
        .flatten()
        .filter(|parent| !commit_ids.contains(parent.id()))
        .map(|base_commit| base_commit.id().clone())
        .collect();

    // Build a map of each commit to its "base commits" (closest ancestors not in
    // `commits`).
    //
    // We process commits in topological order (parents before children) so that
    // we can propagate the base commits from parents to children. Note that the
    // `commits` vector is in reverse topological order, so we iterate in reverse.
    let mut base_commit_map: HashMap<CommitId, IndexSet<CommitId>> = HashMap::new();
    for commit in commits.iter().rev() {
        let mut parent_commit_ids: IndexSet<CommitId> = IndexSet::new();

        for parent_id in commit.parent_ids() {
            if let Some(parent_bases) = base_commit_map.get(parent_id) {
                parent_commit_ids.extend(parent_bases.iter().cloned());
            }
            if base_commit_ids.contains(parent_id) {
                parent_commit_ids.insert(parent_id.clone());
            }
        }
        base_commit_map.insert(commit.id().clone(), parent_commit_ids);
    }

    Ok(base_commit_map)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compute_file_line_count() {
        assert_eq!(compute_file_line_count(b""), 0);
        assert_eq!(compute_file_line_count(b"a"), 1);
        assert_eq!(compute_file_line_count(b"a\n"), 1);
        assert_eq!(compute_file_line_count(b"a\nb"), 2);
        assert_eq!(compute_file_line_count(b"a\nb\n"), 2);
    }
}