Skip to main content

gix_imara_diff/
slider_heuristic.rs

1// Modified for gitoxide from the upstream imara-diff crate.
2// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/slider_heuristic.rs
3
4use std::cmp::Ordering;
5use std::hash::Hash;
6use std::ops::{Add, Range};
7
8use crate::intern::Token;
9
10/// A trait for heuristics that determine the best position for ambiguous diff hunks.
11///
12/// During postprocessing, some hunks can be moved up or down without changing the
13/// minimal nature of the diff. This trait allows customizing the logic for choosing
14/// the optimal position for such hunks.
15pub trait SliderHeuristic {
16    /// Determines the best ending position for a hunk that can be slid.
17    ///
18    /// # Parameters
19    ///
20    /// * `tokens` - The token sequence being diffed
21    /// * `hunk` - The range representing the current hunk position
22    /// * `earliest_end` - The earliest valid ending position for the hunk
23    ///
24    /// # Returns
25    ///
26    /// The preferred ending position for the hunk
27    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32;
28}
29
30impl<F> SliderHeuristic for F
31where
32    F: FnMut(&[Token], Range<u32>, u32) -> u32,
33{
34    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32 {
35        self(tokens, hunk, earliest_end)
36    }
37}
38
39/// A slider heuristic that doesn't adjust hunk positions.
40///
41/// This heuristic always places hunks at their lowest possible position without
42/// applying any additional logic.
43pub struct NoSliderHeuristic;
44
45impl SliderHeuristic for NoSliderHeuristic {
46    fn best_slider_end(&mut self, _tokens: &[Token], hunk: Range<u32>, _earliest_end: u32) -> u32 {
47        hunk.end
48    }
49}
50
51/// A slider heuristic that uses indentation levels to determine the best hunk position.
52///
53/// This heuristic analyzes the indentation of lines surrounding potential hunk positions
54/// and chooses the position that results in the most intuitive diff for human readers.
55/// It's particularly effective for code and other indented text.
56pub struct IndentHeuristic<IndentOfToken> {
57    /// A function that computes the indentation level for a given token.
58    indent_of_token: IndentOfToken,
59}
60
61impl<IndentOfToken> IndentHeuristic<IndentOfToken> {
62    /// Creates a new `IndentHeuristic` with the given indentation function.
63    ///
64    /// # Parameters
65    ///
66    /// * `indent_of_token` - A function that takes a token and returns its indentation level
67    pub fn new(indent_of_token: IndentOfToken) -> Self {
68        Self { indent_of_token }
69    }
70}
71
72impl<IndentOfToken: Fn(Token) -> IndentLevel> SliderHeuristic for IndentHeuristic<IndentOfToken> {
73    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32 {
74        const MAX_SLIDING: u32 = 100;
75        // This is a pure insertion that can be moved freely up and down.
76        // To get more intuitive results, apply a heuristic.
77        let mut top_slider_end = earliest_end;
78        // TODO: why is this needed
79        if top_slider_end < hunk.start - 1 {
80            top_slider_end = hunk.start - 1;
81        }
82        if hunk.end > top_slider_end + MAX_SLIDING {
83            top_slider_end = hunk.end - MAX_SLIDING;
84        }
85        let group_size = hunk.end - hunk.start;
86        let mut best_score = Score::for_range(
87            top_slider_end - group_size..top_slider_end,
88            tokens,
89            &self.indent_of_token,
90        );
91        let mut best_slider_end = top_slider_end;
92        for slider_end in (top_slider_end + 1)..=hunk.end {
93            let score = Score::for_range(slider_end - group_size..slider_end, tokens, &self.indent_of_token);
94            if score.is_improvement_over(best_score) {
95                best_score = score;
96                best_slider_end = slider_end;
97            }
98        }
99        best_slider_end
100    }
101}
102
103/// Represents the indentation level of a line.
104///
105/// Indentation is measured in spaces, with tabs expanded according to a configurable tab width.
106/// Special values are used to represent blank lines and maximum indentation.
107#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd)]
108pub struct IndentLevel(u8);
109
110impl IndentLevel {
111    /// Represents a line that is empty or contains only whitespace (or EOF).
112    const BLANK: IndentLevel = IndentLevel(u8::MAX);
113    /// The maximum trackable indentation level.
114    const MAX: IndentLevel = IndentLevel(200);
115
116    /// Computes the indentation level for an ASCII line.
117    ///
118    /// # Parameters
119    ///
120    /// * `src` - An iterator over the bytes of the line
121    /// * `tab_width` - The number of spaces that a tab character represents (min is 1)
122    ///
123    /// # Returns
124    ///
125    /// The computed indentation level, or `BLANK` if the line contains only whitespace
126    pub fn for_ascii_line(src: impl IntoIterator<Item = u8>, tab_width: u8) -> IndentLevel {
127        let mut indent_level = IndentLevel(0);
128        let tab_width = tab_width.max(1);
129        for c in src {
130            match c {
131                b' ' => indent_level = indent_level.increased_by(1),
132                b'\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width),
133                b'\r' | b'\n' | b'\x0C' => (),
134                _ => return indent_level,
135            }
136            if indent_level >= Self::MAX {
137                return Self::MAX;
138            }
139        }
140        IndentLevel::BLANK
141    }
142
143    /// Computes the indentation level for a Unicode line.
144    ///
145    /// # Parameters
146    ///
147    /// * `src` - An iterator over the characters of the line
148    /// * `tab_width` - The number of spaces that a tab character represents
149    ///
150    /// # Returns
151    ///
152    /// The computed indentation level, or `BLANK` if the line contains only whitespace
153    pub fn for_line(src: impl IntoIterator<Item = char>, tab_width: u8) -> IndentLevel {
154        let mut indent_level = IndentLevel(0);
155        let tab_width = tab_width.max(1);
156        for c in src {
157            match c {
158                ' ' => indent_level = indent_level.increased_by(1),
159                '\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width),
160                '\r' | '\n' | '\x0C' => (),
161                _ => return indent_level,
162            }
163            if indent_level >= Self::MAX {
164                return Self::MAX;
165            }
166        }
167        IndentLevel::BLANK
168    }
169
170    fn increased_by(self, amount: u8) -> Self {
171        IndentLevel(self.0.saturating_add(amount).min(Self::MAX.0))
172    }
173
174    fn map_or<T>(self, default: T, f: impl FnOnce(u8) -> T) -> T {
175        if self == Self::BLANK { default } else { f(self.0) }
176    }
177
178    fn or(self, default: Self) -> Self {
179        if self == Self::BLANK { default } else { self }
180    }
181}
182
183/// Captures indentation information for a token and its surrounding context.
184///
185/// This structure is used by the indent heuristic to evaluate different hunk positions.
186#[derive(Debug, Clone, Copy, PartialEq, Eq)]
187pub struct Indents {
188    /// Indentation level of the current line/token.
189    indent: IndentLevel,
190    /// Indentation level of the previous non-blank line.
191    prev_indent: IndentLevel,
192    /// Indentation level of the next non-blank line.
193    next_indent: IndentLevel,
194    /// The number of consecutive blank lines above the current position.
195    leading_blanks: u8,
196    /// The number of blank lines after the line following the current position.
197    trailing_blanks: u8,
198}
199
200/// Maximum number of consecutive blank lines to consider when computing indentation context.
201const MAX_BLANKS: usize = 20;
202
203impl Indents {
204    fn at_token(tokens: &[Token], token_idx: usize, indent_of_token: impl Fn(Token) -> IndentLevel) -> Indents {
205        let (leading_blank_lines, indent_previous_line) = tokens[..token_idx]
206            .iter()
207            .rev()
208            .enumerate()
209            .find_map(|(i, &token)| {
210                if i == MAX_BLANKS {
211                    Some((i, IndentLevel(0)))
212                } else {
213                    let level = indent_of_token(token);
214                    if level == IndentLevel::BLANK {
215                        None
216                    } else {
217                        Some((i, level))
218                    }
219                }
220            })
221            .unwrap_or((token_idx, IndentLevel::BLANK));
222        let at_eof = token_idx == tokens.len();
223        let (trailing_blank_lines, indent_next_line) = if at_eof {
224            (0, IndentLevel::BLANK)
225        } else {
226            tokens[token_idx + 1..]
227                .iter()
228                .enumerate()
229                .find_map(|(i, &token)| {
230                    if i == MAX_BLANKS {
231                        Some((i, IndentLevel(0)))
232                    } else {
233                        let level = indent_of_token(token);
234                        if level == IndentLevel::BLANK {
235                            None
236                        } else {
237                            Some((i, level))
238                        }
239                    }
240                })
241                .unwrap_or((token_idx, IndentLevel::BLANK))
242        };
243        let indent = tokens
244            .get(token_idx)
245            .map_or(IndentLevel::BLANK, |&token| indent_of_token(token));
246        Indents {
247            indent,
248            prev_indent: indent_previous_line,
249            next_indent: indent_next_line,
250            leading_blanks: leading_blank_lines as u8,
251            trailing_blanks: trailing_blank_lines as u8,
252        }
253    }
254
255    fn score(&self) -> Score {
256        let mut penalty = 0;
257        if self.prev_indent == IndentLevel::BLANK && self.leading_blanks == 0 {
258            penalty += START_OF_FILE_PENALTY;
259        }
260        if self.next_indent == IndentLevel::BLANK && self.trailing_blanks == 0 {
261            penalty += END_OF_FILE_PENALTY;
262        }
263
264        let trailing_blank_lines = if self.indent == IndentLevel::BLANK {
265            self.trailing_blanks as i32 + 1
266        } else {
267            0
268        };
269        let total_blank_lines = trailing_blank_lines + self.leading_blanks as i32;
270        penalty += TOTAL_BLANK_LINE_WEIGHT * total_blank_lines + trailing_blank_lines * TRAILING_BLANK_LINES_WEIGHT;
271        let indent = self.indent.or(self.next_indent);
272        if indent != IndentLevel::BLANK && self.prev_indent != IndentLevel::BLANK {
273            match indent.0.cmp(&self.prev_indent.0) {
274                Ordering::Equal => {}
275                // self.next_indent != IndentLevel::BLANK follows for free here
276                // since indent != BLANK and therefore self.next_indent <= indent < BLANK
277                Ordering::Less if self.next_indent.0 <= indent.0 => {
278                    penalty += if total_blank_lines != 0 {
279                        RELATIVE_DEDENT_WITH_BLANK_PENALTY
280                    } else {
281                        RELATIVE_DEDENT_PENALTY
282                    }
283                }
284                Ordering::Less => {
285                    penalty += if total_blank_lines != 0 {
286                        RELATIVE_OUTDENT_WITH_BLANK_PENALTY
287                    } else {
288                        RELATIVE_OUTDENT_PENALTY
289                    }
290                }
291                Ordering::Greater => {
292                    penalty += if total_blank_lines != 0 {
293                        RELATIVE_INDENT_WITH_BLANK_PENALTY
294                    } else {
295                        RELATIVE_INDENT_PENALTY
296                    }
297                }
298            }
299        }
300        Score {
301            indent: indent.map_or(-1, i32::from),
302            penalty,
303        }
304    }
305}
306
307/// Penalty for placing a hunk at the start of a file.
308const START_OF_FILE_PENALTY: i32 = 1;
309/// Penalty for placing a hunk at the end of a file.
310const END_OF_FILE_PENALTY: i32 = 21;
311/// Weight applied to the total number of blank lines surrounding a hunk (negative means preferred).
312const TOTAL_BLANK_LINE_WEIGHT: i32 = -30;
313/// Additional weight for trailing blank lines.
314const TRAILING_BLANK_LINES_WEIGHT: i32 = 6;
315
316/// Penalty for placing a hunk where indentation increases (negative means preferred).
317const RELATIVE_INDENT_PENALTY: i32 = -4;
318/// Penalty for placing a hunk where indentation increases with blank lines present.
319const RELATIVE_INDENT_WITH_BLANK_PENALTY: i32 = 10;
320
321/// Penalty for placing a hunk where indentation decreases (outdent).
322const RELATIVE_OUTDENT_PENALTY: i32 = 24;
323/// Penalty for placing a hunk where indentation decreases with blank lines present.
324const RELATIVE_OUTDENT_WITH_BLANK_PENALTY: i32 = 17;
325
326/// Penalty for placing a hunk where indentation decreases but stays aligned (dedent).
327const RELATIVE_DEDENT_PENALTY: i32 = 23;
328/// Penalty for placing a hunk where indentation decreases but stays aligned with blank lines present.
329const RELATIVE_DEDENT_WITH_BLANK_PENALTY: i32 = 17;
330
331/// Weight factor for comparing indentation levels when scoring positions.
332const INDENT_WEIGHT: i32 = 60;
333
334/// A score for evaluating the quality of a hunk position.
335///
336/// Lower scores are better. The score considers both indentation level
337/// and various penalties based on the surrounding context.
338#[derive(PartialEq, Eq, Clone, Copy)]
339struct Score {
340    /// The combined indentation level at the hunk boundaries.
341    indent: i32,
342    /// The total penalty from various heuristics.
343    penalty: i32,
344}
345
346impl Score {
347    fn for_range(range: Range<u32>, tokens: &[Token], indent_of_token: impl Fn(Token) -> IndentLevel) -> Score {
348        Indents::at_token(tokens, range.start as usize, &indent_of_token).score()
349            + Indents::at_token(tokens, range.end as usize, &indent_of_token).score()
350    }
351}
352
353impl Add for Score {
354    type Output = Score;
355
356    fn add(self, rhs: Self) -> Self::Output {
357        Score {
358            indent: self.indent + rhs.indent,
359            penalty: self.penalty + rhs.penalty,
360        }
361    }
362}
363
364impl Score {
365    fn is_improvement_over(self, prev_score: Self) -> bool {
366        // smaller indentation level is preferred (with a weight)
367        let indent_score = match prev_score.indent.cmp(&self.indent) {
368            Ordering::Less => INDENT_WEIGHT,
369            Ordering::Greater => -INDENT_WEIGHT,
370            Ordering::Equal => 0,
371        };
372        (indent_score + self.penalty - prev_score.penalty) <= 0
373    }
374}
375
376#[cfg(test)]
377mod tests {
378    use super::IndentLevel;
379
380    #[test]
381    fn ascii_indent_clamps_before_overflow() {
382        assert_eq!(
383            IndentLevel::for_ascii_line(std::iter::repeat_n(b' ', 255), 1),
384            IndentLevel::MAX
385        );
386        assert_eq!(
387            IndentLevel::for_ascii_line(std::iter::repeat_n(b'\t', 8), u8::MAX),
388            IndentLevel::MAX
389        );
390    }
391
392    #[test]
393    fn unicode_indent_treats_zero_tab_width_as_one() {
394        assert_eq!(IndentLevel::for_line(['\t', 'x'], 0), IndentLevel(1));
395    }
396}