gix_diff/blob/mod.rs
//! To work with text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff),
//! a crate maintained by [Pascal Kuthe](https://github.com/pascalkuthe).
3use std::{collections::HashMap, path::PathBuf};
4
5use bstr::BString;
6pub use imara_diff::*;
7
/// Re-export of the `imara-diff` v0.2 types, for use with slider heuristics.
///
/// This module provides access to the v0.2 API of `imara-diff`, which includes
/// support for Git's slider heuristics to produce more intuitive diffs.
12#[cfg(feature = "blob-experimental")]
13pub use imara_diff_v2 as v2;
14
/// Compute a diff and post-process it with Git's slider heuristics to produce more intuitive hunks.
///
/// This function uses `imara-diff` v0.2, which provides the [`v2::Diff`] structure
/// that supports postprocessing with slider heuristics. The slider heuristics move
/// diff hunks to more intuitive locations based on indentation and other factors,
/// resulting in diffs that are more readable and match Git's output more closely.
///
/// * `algorithm` - the diff algorithm handed to [`v2::Diff::compute`].
/// * `input` - the interned `before` and `after` sides; it is used for the computation itself
///   and once more for the line-based postprocessing.
///
/// Returns the [`v2::Diff`] after postprocessing was applied to it.
///
/// # Examples
///
/// ```
/// use gix_diff::blob::{diff_with_slider_heuristics, v2::{Algorithm, InternedInput}};
///
/// let before = "fn foo() {\n let x = 1;\n}\n";
/// let after = "fn foo() {\n let x = 2;\n}\n";
///
/// let input = InternedInput::new(before, after);
/// let diff = diff_with_slider_heuristics(Algorithm::Histogram, &input);
///
/// // The diff now has slider heuristics applied
/// assert_eq!(diff.count_removals(), 1);
/// assert_eq!(diff.count_additions(), 1);
/// ```
#[cfg(feature = "blob-experimental")]
pub fn diff_with_slider_heuristics<T: AsRef<[u8]>>(algorithm: v2::Algorithm, input: &v2::InternedInput<T>) -> v2::Diff {
    // Step 1: the plain diff as produced by the chosen algorithm.
    let mut diff = v2::Diff::compute(algorithm, input);
    // Step 2: adjust the hunks in place; the same `input` is needed to look at the lines.
    diff.postprocess_lines(input);
    diff
}
43
44///
45pub mod pipeline;
46
47///
48pub mod platform;
49
50pub mod unified_diff;
51pub use unified_diff::impls::UnifiedDiff;
52
/// Information about the diff performed to detect similarity.
///
/// All counts are in lines. The default value has every count at `0` and a `similarity` of `0.0`.
#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)]
pub struct DiffLineStats {
    /// The amount of lines to remove from the source to get to the destination.
    pub removals: u32,
    /// The amount of lines to add to the source to get to the destination.
    pub insertions: u32,
    /// The amount of lines of the previous state, in the source.
    pub before: u32,
    /// The amount of lines of the new state, in the destination.
    pub after: u32,
    /// A value from 0.0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%.
    ///
    /// Similarity is the ratio between all lines in the previous blob and the current blob,
    /// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`.
    // NOTE(review): read literally, the formula above yields 0.0 for two blobs with the same line count,
    // which contradicts 1.0 being a perfect match. The computation isn't visible from here - confirm the
    // formula against the place that fills in this field.
    pub similarity: f32,
}
69
/// A way to classify a resource suitable for diffing, telling the old side from the new side.
///
/// As the ordering is derived, `OldOrSource` sorts before `NewOrDestination`.
#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)]
pub enum ResourceKind {
    /// The source of a rewrite, rename or copy operation, or generally the old version of a resource.
    OldOrSource,
    /// The destination of a rewrite, rename or copy operation, or generally the new version of a resource.
    NewOrDestination,
}
78
79/// A set of values to define how to diff something that is associated with it using `git-attributes`, relevant for regular files.
80///
81/// Some values are related to diffing, some are related to conversions.
82#[derive(Default, Debug, Clone, PartialEq, Eq)]
83pub struct Driver {
84 /// The name of the driver, as referred to by `[diff "name"]` in the git configuration.
85 pub name: BString,
86 /// The command to execute to perform the diff entirely like `<command> old-file old-hex old-mode new-file new-hex new-mode`.
87 ///
88 /// Please note that we don't make this call ourselves, but use it to determine that we should not run the our standard
89 /// built-in algorithm but bail instead as the output of such a program isn't standardized.
90 pub command: Option<BString>,
91 /// The per-driver algorithm to use.
92 pub algorithm: Option<Algorithm>,
93 /// The external filter program to call like `<binary_to_text_command> /path/to/blob` which outputs a textual version of the provided
94 /// binary file.
95 /// Note that it's invoked with a shell if arguments are given.
96 /// Further, if present, it will always be executed, whether `is_binary` is set or not.
97 pub binary_to_text_command: Option<BString>,
98 /// `Some(true)` if this driver deals with binary files, which means that a `binary_to_text_command` should be used to convert binary
99 /// into a textual representation.
100 /// Without such a command, anything that is considered binary is not diffed, but only the size of its data is made available.
101 /// If `Some(false)`, it won't be considered binary, and the its data will not be sampled for the null-byte either.
102 /// Leaving it to `None` means binary detection is automatic, and is based on the presence of the `0` byte in the first 8kB of the buffer.
103 pub is_binary: Option<bool>,
104}
105
106/// A conversion pipeline to take an object or path from what's stored in `git` to what can be diffed, while
107/// following the guidance of git-attributes at the respective path to learn if diffing should happen or if
108/// the content is considered binary.
109///
110/// There are two different conversion flows, where the target of the flow is a buffer with diffable content:
111// TODO: update this with information about possible directions.
112///
113/// * `worktree on disk` -> `text conversion`
114/// * `object` -> `worktree-filters` -> `text conversion`
115#[derive(Clone)]
116pub struct Pipeline {
117 /// A way to read data directly from the worktree.
118 pub roots: pipeline::WorktreeRoots,
119 /// A pipeline to convert objects from what's stored in `git` to its worktree version.
120 pub worktree_filter: gix_filter::Pipeline,
121 /// Options affecting the way we read files.
122 pub options: pipeline::Options,
123 /// Drivers to help customize the conversion behaviour depending on the location of items.
124 drivers: Vec<Driver>,
125 /// Pre-configured attributes to obtain additional diff-related information.
126 attrs: gix_filter::attributes::search::Outcome,
127 /// A buffer to manipulate paths
128 path: PathBuf,
129}
130
131/// A utility for performing a diff of two blobs, including flexible conversions, conversion-caching
132/// acquisition of diff information.
133/// Note that this instance will not call external filters as their output can't be known programmatically,
134/// but it allows to prepare their input if the caller wishes to perform this task.
135///
136/// Optimized for NxM lookups with built-in caching.
137#[derive(Clone)]
138pub struct Platform {
139 /// The old version of a diff-able blob, if set.
140 old: Option<platform::CacheKey>,
141 /// The new version of a diff-able blob, if set.
142 new: Option<platform::CacheKey>,
143
144 /// Options to alter how diffs should be performed.
145 pub options: platform::Options,
146 /// A way to convert objects into a diff-able format.
147 pub filter: Pipeline,
148 /// A way to access .gitattributes
149 pub attr_stack: gix_worktree::Stack,
150
151 /// The way we convert resources into diffable states.
152 pub filter_mode: pipeline::Mode,
153 /// A continuously growing cache keeping ready-for-diff blobs by their path in the worktree,
154 /// as that is what affects their final diff-able state.
155 ///
156 /// That way, expensive rewrite-checks with NxM matrix checks would be as fast as possible,
157 /// avoiding duplicate work.
158 diff_cache: HashMap<platform::CacheKey, platform::CacheValue>,
159 /// A list of previously used buffers, ready for re-use.
160 free_list: Vec<Vec<u8>>,
161}
162
163mod impls {
164 use crate::blob::ResourceKind;
165
166 impl std::fmt::Display for ResourceKind {
167 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168 f.write_str(match self {
169 ResourceKind::OldOrSource => "old",
170 ResourceKind::NewOrDestination => "new",
171 })
172 }
173 }
174}