Skip to main content

gix_imara_diff/
slider_heuristic.rs

1// Modified for gitoxide from the upstream imara-diff crate.
2// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/slider_heuristic.rs
3
4use std::cmp::Ordering;
5use std::hash::Hash;
6use std::ops::{Add, Range};
7
8use crate::intern::Token;
9
10/// A trait for heuristics that determine the best position for ambiguous diff hunks.
11///
12/// During postprocessing, some hunks can be moved up or down without changing the
13/// minimal nature of the diff. This trait allows customizing the logic for choosing
14/// the optimal position for such hunks.
15pub trait SliderHeuristic {
16    /// Determines the best ending position for a hunk that can be slid.
17    ///
18    /// # Parameters
19    ///
20    /// * `tokens` - The token sequence being diffed
21    /// * `hunk` - The range representing the current hunk position
22    /// * `earliest_end` - The earliest valid ending position for the hunk
23    ///
24    /// # Returns
25    ///
26    /// The preferred ending position for the hunk
27    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32;
28}
29
30impl<F> SliderHeuristic for F
31where
32    F: FnMut(&[Token], Range<u32>, u32) -> u32,
33{
34    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32 {
35        self(tokens, hunk, earliest_end)
36    }
37}
38
39/// A slider heuristic that doesn't adjust hunk positions.
40///
41/// This heuristic always places hunks at their lowest possible position without
42/// applying any additional logic.
43pub struct NoSliderHeuristic;
44
45impl SliderHeuristic for NoSliderHeuristic {
46    fn best_slider_end(&mut self, _tokens: &[Token], hunk: Range<u32>, _earliest_end: u32) -> u32 {
47        hunk.end
48    }
49}
50
51/// A slider heuristic that uses indentation levels to determine the best hunk position.
52///
53/// This heuristic analyzes the indentation of lines surrounding potential hunk positions
54/// and chooses the position that results in the most intuitive diff for human readers.
55/// It's particularly effective for code and other indented text.
56pub struct IndentHeuristic<IndentOfToken> {
57    /// A function that computes the indentation level for a given token.
58    indent_of_token: IndentOfToken,
59}
60
61impl<IndentOfToken> IndentHeuristic<IndentOfToken> {
62    /// Creates a new `IndentHeuristic` with the given indentation function.
63    ///
64    /// # Parameters
65    ///
66    /// * `indent_of_token` - A function that takes a token and returns its indentation level
67    pub fn new(indent_of_token: IndentOfToken) -> Self {
68        Self { indent_of_token }
69    }
70}
71
72impl<IndentOfToken: Fn(Token) -> IndentLevel> SliderHeuristic for IndentHeuristic<IndentOfToken> {
73    fn best_slider_end(&mut self, tokens: &[Token], hunk: Range<u32>, earliest_end: u32) -> u32 {
74        const MAX_SLIDING: u32 = 100;
75        // This is a pure insertion that can be moved freely up and down.
76        // To get more intuitive results, apply a heuristic.
77        let mut top_slider_end = earliest_end;
78        // TODO: why is this needed
79        if top_slider_end < hunk.start - 1 {
80            top_slider_end = hunk.start - 1;
81        }
82        if hunk.end > top_slider_end + MAX_SLIDING {
83            top_slider_end = hunk.end - MAX_SLIDING;
84        }
85        let group_size = hunk.end - hunk.start;
86        let mut best_score = Score::for_range(
87            top_slider_end - group_size..top_slider_end,
88            tokens,
89            &self.indent_of_token,
90        );
91        let mut best_slider_end = top_slider_end;
92        for slider_end in (top_slider_end + 1)..=hunk.end {
93            let score = Score::for_range(slider_end - group_size..slider_end, tokens, &self.indent_of_token);
94            if score.is_improvement_over(best_score) {
95                best_score = score;
96                best_slider_end = slider_end;
97            }
98        }
99        best_slider_end
100    }
101}
102
103/// Represents the indentation level of a line.
104///
105/// Indentation is measured in spaces, with tabs expanded according to a configurable tab width.
106/// Special values are used to represent blank lines and maximum indentation.
107#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd)]
108pub struct IndentLevel(u8);
109
110impl IndentLevel {
111    /// Represents a line that is empty or contains only whitespace (or EOF).
112    const BLANK: IndentLevel = IndentLevel(u8::MAX);
113    /// The maximum trackable indentation level.
114    const MAX: IndentLevel = IndentLevel(200);
115
116    /// Computes the indentation level for an ASCII line.
117    ///
118    /// # Parameters
119    ///
120    /// * `src` - An iterator over the bytes of the line
121    /// * `tab_width` - The number of spaces that a tab character represents (min is 1)
122    ///
123    /// # Returns
124    ///
125    /// The computed indentation level, or `BLANK` if the line contains only whitespace
126    pub fn for_ascii_line(src: impl IntoIterator<Item = u8>, tab_width: u8) -> IndentLevel {
127        let mut indent_level = IndentLevel(0);
128        let tab_width = tab_width.max(1);
129        for c in src {
130            match c {
131                b' ' => indent_level = indent_level.increased_by(1),
132                b'\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width),
133                b'\r' | b'\n' | b'\x0C' => (),
134                _ => return indent_level,
135            }
136            if indent_level >= Self::MAX {
137                return Self::MAX;
138            }
139        }
140        IndentLevel::BLANK
141    }
142
143    /// Computes the indentation level for a Unicode line.
144    ///
145    /// # Parameters
146    ///
147    /// * `src` - An iterator over the characters of the line
148    /// * `tab_width` - The number of spaces that a tab character represents
149    ///
150    /// # Returns
151    ///
152    /// The computed indentation level, or `BLANK` if the line contains only whitespace
153    pub fn for_line(src: impl IntoIterator<Item = char>, tab_width: u8) -> IndentLevel {
154        let mut indent_level = IndentLevel(0);
155        let tab_width = tab_width.max(1);
156        for c in src {
157            match c {
158                ' ' => indent_level = indent_level.increased_by(1),
159                '\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width),
160                '\r' | '\n' | '\x0C' => (),
161                _ => return indent_level,
162            }
163            if indent_level >= Self::MAX {
164                return Self::MAX;
165            }
166        }
167        IndentLevel::BLANK
168    }
169
170    fn increased_by(self, amount: u8) -> Self {
171        IndentLevel(self.0.saturating_add(amount).min(Self::MAX.0))
172    }
173
174    fn map_or<T>(self, default: T, f: impl FnOnce(u8) -> T) -> T {
175        if self == Self::BLANK {
176            default
177        } else {
178            f(self.0)
179        }
180    }
181
182    fn or(self, default: Self) -> Self {
183        if self == Self::BLANK {
184            default
185        } else {
186            self
187        }
188    }
189}
190
191/// Captures indentation information for a token and its surrounding context.
192///
193/// This structure is used by the indent heuristic to evaluate different hunk positions.
194#[derive(Debug, Clone, Copy, PartialEq, Eq)]
195pub struct Indents {
196    /// Indentation level of the current line/token.
197    indent: IndentLevel,
198    /// Indentation level of the previous non-blank line.
199    prev_indent: IndentLevel,
200    /// Indentation level of the next non-blank line.
201    next_indent: IndentLevel,
202    /// The number of consecutive blank lines above the current position.
203    leading_blanks: u8,
204    /// The number of blank lines after the line following the current position.
205    trailing_blanks: u8,
206}
207
208/// Maximum number of consecutive blank lines to consider when computing indentation context.
209const MAX_BLANKS: usize = 20;
210
211impl Indents {
212    fn at_token(tokens: &[Token], token_idx: usize, indent_of_token: impl Fn(Token) -> IndentLevel) -> Indents {
213        let (leading_blank_lines, indent_previous_line) = tokens[..token_idx]
214            .iter()
215            .rev()
216            .enumerate()
217            .find_map(|(i, &token)| {
218                if i == MAX_BLANKS {
219                    Some((i, IndentLevel(0)))
220                } else {
221                    let level = indent_of_token(token);
222                    if level == IndentLevel::BLANK {
223                        None
224                    } else {
225                        Some((i, level))
226                    }
227                }
228            })
229            .unwrap_or((token_idx, IndentLevel::BLANK));
230        let at_eof = token_idx == tokens.len();
231        let (trailing_blank_lines, indent_next_line) = if at_eof {
232            (0, IndentLevel::BLANK)
233        } else {
234            tokens[token_idx + 1..]
235                .iter()
236                .enumerate()
237                .find_map(|(i, &token)| {
238                    if i == MAX_BLANKS {
239                        Some((i, IndentLevel(0)))
240                    } else {
241                        let level = indent_of_token(token);
242                        if level == IndentLevel::BLANK {
243                            None
244                        } else {
245                            Some((i, level))
246                        }
247                    }
248                })
249                .unwrap_or((token_idx, IndentLevel::BLANK))
250        };
251        let indent = tokens
252            .get(token_idx)
253            .map_or(IndentLevel::BLANK, |&token| indent_of_token(token));
254        Indents {
255            indent,
256            prev_indent: indent_previous_line,
257            next_indent: indent_next_line,
258            leading_blanks: leading_blank_lines as u8,
259            trailing_blanks: trailing_blank_lines as u8,
260        }
261    }
262
263    fn score(&self) -> Score {
264        let mut penalty = 0;
265        if self.prev_indent == IndentLevel::BLANK && self.leading_blanks == 0 {
266            penalty += START_OF_FILE_PENALTY;
267        }
268        if self.next_indent == IndentLevel::BLANK && self.trailing_blanks == 0 {
269            penalty += END_OF_FILE_PENALTY;
270        }
271
272        let trailing_blank_lines = if self.indent == IndentLevel::BLANK {
273            self.trailing_blanks as i32 + 1
274        } else {
275            0
276        };
277        let total_blank_lines = trailing_blank_lines + self.leading_blanks as i32;
278        penalty += TOTAL_BLANK_LINE_WEIGHT * total_blank_lines + trailing_blank_lines * TRAILING_BLANK_LINES_WEIGHT;
279        let indent = self.indent.or(self.next_indent);
280        if indent != IndentLevel::BLANK && self.prev_indent != IndentLevel::BLANK {
281            match indent.0.cmp(&self.prev_indent.0) {
282                Ordering::Equal => {}
283                // self.next_indent != IndentLevel::BLANK follows for free here
284                // since indent != BLANK and therefore self.next_indent <= indent < BLANK
285                Ordering::Less if self.next_indent.0 <= indent.0 => {
286                    penalty += if total_blank_lines != 0 {
287                        RELATIVE_DEDENT_WITH_BLANK_PENALTY
288                    } else {
289                        RELATIVE_DEDENT_PENALTY
290                    }
291                }
292                Ordering::Less => {
293                    penalty += if total_blank_lines != 0 {
294                        RELATIVE_OUTDENT_WITH_BLANK_PENALTY
295                    } else {
296                        RELATIVE_OUTDENT_PENALTY
297                    }
298                }
299                Ordering::Greater => {
300                    penalty += if total_blank_lines != 0 {
301                        RELATIVE_INDENT_WITH_BLANK_PENALTY
302                    } else {
303                        RELATIVE_INDENT_PENALTY
304                    }
305                }
306            }
307        }
308        Score {
309            indent: indent.map_or(-1, i32::from),
310            penalty,
311        }
312    }
313}
314
315/// Penalty for placing a hunk at the start of a file.
316const START_OF_FILE_PENALTY: i32 = 1;
317/// Penalty for placing a hunk at the end of a file.
318const END_OF_FILE_PENALTY: i32 = 21;
319/// Weight applied to the total number of blank lines surrounding a hunk (negative means preferred).
320const TOTAL_BLANK_LINE_WEIGHT: i32 = -30;
321/// Additional weight for trailing blank lines.
322const TRAILING_BLANK_LINES_WEIGHT: i32 = 6;
323
324/// Penalty for placing a hunk where indentation increases (negative means preferred).
325const RELATIVE_INDENT_PENALTY: i32 = -4;
326/// Penalty for placing a hunk where indentation increases with blank lines present.
327const RELATIVE_INDENT_WITH_BLANK_PENALTY: i32 = 10;
328
329/// Penalty for placing a hunk where indentation decreases (outdent).
330const RELATIVE_OUTDENT_PENALTY: i32 = 24;
331/// Penalty for placing a hunk where indentation decreases with blank lines present.
332const RELATIVE_OUTDENT_WITH_BLANK_PENALTY: i32 = 17;
333
334/// Penalty for placing a hunk where indentation decreases but stays aligned (dedent).
335const RELATIVE_DEDENT_PENALTY: i32 = 23;
336/// Penalty for placing a hunk where indentation decreases but stays aligned with blank lines present.
337const RELATIVE_DEDENT_WITH_BLANK_PENALTY: i32 = 17;
338
339/// Weight factor for comparing indentation levels when scoring positions.
340const INDENT_WEIGHT: i32 = 60;
341
342/// A score for evaluating the quality of a hunk position.
343///
344/// Lower scores are better. The score considers both indentation level
345/// and various penalties based on the surrounding context.
346#[derive(PartialEq, Eq, Clone, Copy)]
347struct Score {
348    /// The combined indentation level at the hunk boundaries.
349    indent: i32,
350    /// The total penalty from various heuristics.
351    penalty: i32,
352}
353
354impl Score {
355    fn for_range(range: Range<u32>, tokens: &[Token], indent_of_token: impl Fn(Token) -> IndentLevel) -> Score {
356        Indents::at_token(tokens, range.start as usize, &indent_of_token).score()
357            + Indents::at_token(tokens, range.end as usize, &indent_of_token).score()
358    }
359}
360
361impl Add for Score {
362    type Output = Score;
363
364    fn add(self, rhs: Self) -> Self::Output {
365        Score {
366            indent: self.indent + rhs.indent,
367            penalty: self.penalty + rhs.penalty,
368        }
369    }
370}
371
372impl Score {
373    fn is_improvement_over(self, prev_score: Self) -> bool {
374        // smaller indentation level is preferred (with a weight)
375        let indent_score = match prev_score.indent.cmp(&self.indent) {
376            Ordering::Less => INDENT_WEIGHT,
377            Ordering::Greater => -INDENT_WEIGHT,
378            Ordering::Equal => 0,
379        };
380        (indent_score + self.penalty - prev_score.penalty) <= 0
381    }
382}
383
384#[cfg(test)]
385mod tests {
386    use super::IndentLevel;
387
388    #[test]
389    fn ascii_indent_clamps_before_overflow() {
390        assert_eq!(
391            IndentLevel::for_ascii_line(std::iter::repeat(b' ').take(255), 1),
392            IndentLevel::MAX
393        );
394        assert_eq!(
395            IndentLevel::for_ascii_line(std::iter::repeat(b'\t').take(8), u8::MAX),
396            IndentLevel::MAX
397        );
398    }
399
400    #[test]
401    fn unicode_indent_treats_zero_tab_width_as_one() {
402        assert_eq!(IndentLevel::for_line(['\t', 'x'], 0), IndentLevel(1));
403    }
404}