imara_diff/myers/
preprocess.rs

1use crate::intern::Token;
2use crate::myers::sqrt;
3
4pub fn preprocess<'a>(
5    before: &[Token],
6    after: &[Token],
7    removed: &'a mut [bool],
8    added: &'a mut [bool],
9) -> (PreprocessedFile, PreprocessedFile) {
10    let (occurances_before, occurances_after) = token_occurrences(before, after);
11    let file1 = PreprocessedFile::new(&occurances_before, before, removed);
12    let file2 = PreprocessedFile::new(&occurances_after, after, added);
13    (file1, file2)
14}
15
16fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurrences>, Vec<Occurrences>) {
17    const MAX_EQLIMIT: u32 = 1024;
18
19    // compute the limit after which tokens are treated as `Occurrences::COMMON`
20    let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
21    let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);
22
23    // first collect how often each token occurs in a file
24    let mut occurrences1 = Vec::new();
25    for token in file1 {
26        let bucket = token.0 as usize;
27        if bucket >= occurrences1.len() {
28            occurrences1.resize(bucket + 1, 0u32);
29        }
30        occurrences1[bucket] += 1;
31    }
32
33    // do the same thing for
34    let mut occurrences2 = Vec::new();
35    let token_occurrences2: Vec<_> = file2
36        .iter()
37        .map(|token| {
38            let bucket = token.0 as usize;
39            if bucket >= occurrences2.len() {
40                occurrences2.resize(bucket + 1, 0);
41            }
42            occurrences2[bucket] += 1;
43            let occurrences1 = *occurrences1.get(bucket).unwrap_or(&0);
44            Occurrences::from_occurrences(occurrences1, eqlimit2)
45        })
46        .collect();
47
48    let token_occurrences1: Vec<_> = file1
49        .iter()
50        .map(|token| {
51            let bucket = token.0 as usize;
52            let occurrences2 = *occurrences2.get(bucket).unwrap_or(&0);
53            Occurrences::from_occurrences(occurrences2, eqlimit1)
54        })
55        .collect();
56
57    (token_occurrences1, token_occurrences2)
58}
59
/// Coarse frequency category assigned to each token during preprocessing.
#[derive(Clone, Copy, Debug)]
enum Occurrences {
    /// Token does not occur in this file
    None,
    /// Token occurs at least once
    Some,
    /// Token occurs very frequently (exact number depends on file size).
    /// Such tokens are usually empty lines or braces and are often not meaningful to a diff
    Common,
}

impl Occurrences {
    /// Maps a raw occurrence count to its category:
    /// `0` → `None`, `1..eqlimit` → `Some`, `eqlimit..` → `Common`.
    ///
    /// A count of zero is always `None`, even when `eqlimit` is zero.
    pub fn from_occurrences(occurrences: u32, eqlimit: u32) -> Occurrences {
        match occurrences {
            0 => Occurrences::None,
            n if n >= eqlimit => Occurrences::Common,
            _ => Occurrences::Some,
        }
    }
}
82
/// A file after preprocessing: only the tokens worth diffing, paired with
/// their positions in the original token stream.
#[derive(Debug)]
pub struct PreprocessedFile {
    // For each entry of `tokens`, the index that token had in the original file.
    pub indices: Vec<u32>,
    // The retained tokens, in original order.
    pub tokens: Vec<Token>,
}
88
89impl PreprocessedFile {
90    fn new(
91        token_occurrences: &[Occurrences],
92        tokens: &[Token],
93        changed: &mut [bool],
94    ) -> PreprocessedFile {
95        let (tokens, indices) = prune_unmatched_tokens(tokens, token_occurrences, changed);
96        PreprocessedFile { indices, tokens }
97    }
98}
99
100fn prune_unmatched_tokens(
101    file: &[Token],
102    token_status: &[Occurrences],
103    changed: &mut [bool],
104) -> (Vec<Token>, Vec<u32>) {
105    assert_eq!(token_status.len(), file.len());
106    file.iter()
107        .zip(token_status)
108        .enumerate()
109        .filter_map(|(i, (&token, &status))| {
110            let prune = match status {
111                Occurrences::None => true,
112                Occurrences::Some => false,
113                Occurrences::Common => should_prune_common_line(token_status, i),
114            };
115            if prune {
116                changed[i] = true;
117                None
118            } else {
119                Some((token, i as u32))
120            }
121        })
122        .unzip()
123}
124
125// TODO do not unnecessarily rescan lines
126fn should_prune_common_line(token_status: &[Occurrences], pos: usize) -> bool {
127    const WINDOW_SIZE: usize = 100;
128
129    let mut unmatched_before = 0;
130    let mut common_before = 0;
131
132    let start = if pos > WINDOW_SIZE { WINDOW_SIZE } else { 0 };
133    for status in token_status[start..pos].iter().rev() {
134        match status {
135            Occurrences::None => {
136                unmatched_before += 1;
137            }
138            Occurrences::Common => {
139                common_before += 1;
140            }
141            Occurrences::Some => break,
142        }
143    }
144
145    if unmatched_before == 0 {
146        return false;
147    }
148
149    let end = token_status.len().min(pos + WINDOW_SIZE);
150    let mut unmatched_after = 0;
151    let mut common_after = 0;
152    for status in token_status[pos..end].iter() {
153        match status {
154            Occurrences::None => {
155                unmatched_after += 1;
156            }
157            Occurrences::Common => {
158                common_after += 1;
159            }
160            Occurrences::Some => break,
161        }
162    }
163
164    if unmatched_after == 0 {
165        return false;
166    }
167
168    let common = common_before + common_after;
169    let unmatched = unmatched_before + unmatched_after;
170
171    unmatched > 3 * common
172}