jujube_lib/
files.rs

1// Copyright 2020 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use diff::slice as diff_slice;
16use std::fmt::{Debug, Error, Formatter};
17
18fn is_word_byte(a: u8) -> bool {
19    a.is_ascii_alphanumeric() || a == b'_'
20}
21
22fn is_same_word(a: u8, b: u8) -> bool {
23    // Don't allow utf-8 code points to be split into separate words
24    (is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0
25}
26
27fn tokenize(data: &[u8]) -> Vec<Vec<u8>> {
28    // TODO: Fix this code to not be so inefficient, and to allow the word
29    // delimiter to be configured.
30    let mut output = vec![];
31    let mut current = vec![];
32    let mut maybe_prev: Option<u8> = None;
33    for b in data {
34        let b = *b;
35        match maybe_prev {
36            None => current.push(b),
37            Some(prev) => {
38                if is_same_word(prev, b) {
39                    current.push(b);
40                } else {
41                    output.push(current);
42                    current = vec![b];
43                }
44            }
45        }
46        maybe_prev = Some(b);
47    }
48    if !current.is_empty() {
49        output.push(current);
50    }
51    output
52}
53
54#[derive(PartialEq, Eq, Clone, Debug)]
55pub enum DiffHunk {
56    Unmodified(Vec<u8>),
57    Added(Vec<u8>),
58    Removed(Vec<u8>),
59}
60
61#[derive(PartialEq, Eq, Clone, Debug)]
62pub struct DiffLine {
63    pub left_line_number: u32,
64    pub right_line_number: u32,
65    pub has_left_content: bool,
66    pub has_right_content: bool,
67    pub hunks: Vec<DiffHunk>,
68}
69
70impl DiffLine {
71    fn reset_line(&mut self) {
72        self.has_left_content = false;
73        self.has_right_content = false;
74        self.hunks.clear();
75    }
76
77    pub fn is_unmodified(&self) -> bool {
78        self.hunks
79            .iter()
80            .all(|hunk| matches!(hunk, DiffHunk::Unmodified(_)))
81    }
82}
83
84pub fn diff(left: &[u8], right: &[u8], callback: &mut impl FnMut(&DiffLine)) {
85    // TODO: Should we attempt to interpret as utf-8 and otherwise break only at
86    // newlines?
87    let left_tokens = tokenize(left);
88    let right_tokens = tokenize(right);
89    let result = diff_slice(&left_tokens, &right_tokens);
90    let mut diff_line = DiffLine {
91        left_line_number: 1,
92        right_line_number: 1,
93        has_left_content: false,
94        has_right_content: false,
95        hunks: vec![],
96    };
97    for hunk in result {
98        match hunk {
99            diff::Result::Both(left, right) => {
100                assert!(left == right);
101                diff_line.has_left_content = true;
102                diff_line.has_right_content = true;
103                diff_line.hunks.push(DiffHunk::Unmodified(left.clone()));
104                if left == &[b'\n'] {
105                    callback(&diff_line);
106                    diff_line.left_line_number += 1;
107                    diff_line.right_line_number += 1;
108                    diff_line.reset_line();
109                }
110            }
111            diff::Result::Left(left) => {
112                diff_line.has_left_content = true;
113                diff_line.hunks.push(DiffHunk::Removed(left.clone()));
114                if left == &[b'\n'] {
115                    callback(&diff_line);
116                    diff_line.left_line_number += 1;
117                    diff_line.reset_line();
118                }
119            }
120            diff::Result::Right(right) => {
121                diff_line.has_right_content = true;
122                diff_line.hunks.push(DiffHunk::Added(right.clone()));
123                if right == &[b'\n'] {
124                    callback(&diff_line);
125                    diff_line.right_line_number += 1;
126                    diff_line.reset_line();
127                }
128            }
129        }
130    }
131    if !diff_line.hunks.is_empty() {
132        callback(&diff_line);
133    }
134}
135
136#[derive(PartialEq, Eq, Clone)]
137pub enum MergeHunk {
138    Resolved(Vec<u8>),
139    Conflict {
140        base: Vec<u8>,
141        left: Vec<u8>,
142        right: Vec<u8>,
143    },
144}
145
146impl Debug for MergeHunk {
147    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
148        match self {
149            MergeHunk::Resolved(data) => f
150                .debug_tuple("Resolved")
151                .field(&String::from_utf8_lossy(data))
152                .finish(),
153            MergeHunk::Conflict { base, left, right } => f
154                .debug_struct("Conflict")
155                .field("base", &String::from_utf8_lossy(base))
156                .field("left", &String::from_utf8_lossy(left))
157                .field("right", &String::from_utf8_lossy(right))
158                .finish(),
159        }
160    }
161}
162
163#[derive(Debug, PartialEq, Eq, Clone)]
164pub enum MergeResult {
165    Resolved(Vec<u8>),
166    Conflict(Vec<MergeHunk>),
167}
168
169/// Returns None if the merge fails
170pub fn merge(base: &[u8], left: &[u8], right: &[u8]) -> MergeResult {
171    let base_tokens = tokenize(base);
172    let left_tokens = tokenize(left);
173    let right_tokens = tokenize(right);
174
175    let left_diff = diff_slice(&base_tokens, &left_tokens);
176    let right_diff = diff_slice(&base_tokens, &right_tokens);
177
178    let mut hunk: Vec<u8> = vec![];
179    let mut hunks: Vec<MergeHunk> = vec![];
180    let mut left_it = left_diff.iter();
181    let mut right_it = right_diff.iter();
182
183    let mut left_hunk = left_it.next();
184    let mut right_hunk = right_it.next();
185    loop {
186        match (left_hunk, right_hunk) {
187            (None, None) => {
188                break;
189            }
190            (Some(diff::Result::Both(left_data_before, left_data_after)), _)
191                if left_data_before == left_data_after =>
192            {
193                // Left unmodified
194                match right_hunk.unwrap() {
195                    diff::Result::Both(right_data_before, right_data_after) => {
196                        // Left unmodified, right modified
197                        assert_eq!(left_data_before, right_data_before);
198                        hunk.append(&mut right_data_after.to_vec());
199                        left_hunk = left_it.next();
200                        right_hunk = right_it.next();
201                    }
202                    diff::Result::Left(right_data_before) => {
203                        // Left unmodified, right deleted
204                        assert_eq!(left_data_before, right_data_before);
205                        left_hunk = left_it.next();
206                        right_hunk = right_it.next();
207                    }
208                    diff::Result::Right(right_data_after) => {
209                        // Left unmodified, right inserted
210                        hunk.append(&mut right_data_after.to_vec());
211                        right_hunk = right_it.next();
212                    }
213                }
214            }
215            (_, Some(diff::Result::Both(right_data_before, right_data_after)))
216                if right_data_before == right_data_after =>
217            {
218                // Right unmodified
219                match left_hunk.unwrap() {
220                    diff::Result::Both(left_data_before, left_data_after) => {
221                        // Right unmodified, left modified
222                        assert_eq!(left_data_before, right_data_before);
223                        hunk.append(&mut left_data_after.to_vec());
224                        left_hunk = left_it.next();
225                        right_hunk = right_it.next();
226                    }
227                    diff::Result::Left(left_data_before) => {
228                        // Right unmodified, left deleted
229                        assert_eq!(left_data_before, right_data_before);
230                        left_hunk = left_it.next();
231                        right_hunk = right_it.next();
232                    }
233                    diff::Result::Right(left_data_after) => {
234                        // Right unmodified, left inserted
235                        hunk.append(&mut left_data_after.to_vec());
236                        left_hunk = left_it.next();
237                    }
238                }
239            }
240            (
241                Some(diff::Result::Left(left_data_before)),
242                Some(diff::Result::Left(right_data_before)),
243            ) => {
244                // Both deleted the same
245                assert_eq!(left_data_before, right_data_before);
246                left_hunk = left_it.next();
247                right_hunk = right_it.next();
248            }
249            (
250                Some(diff::Result::Right(left_data_after)),
251                Some(diff::Result::Right(right_data_after)),
252            ) => {
253                if left_data_after == right_data_after {
254                    // Both inserted the same
255                    hunk.append(&mut left_data_after.to_vec());
256                } else {
257                    // Each side inserted different
258                    if !hunk.is_empty() {
259                        hunks.push(MergeHunk::Resolved(hunk));
260                    }
261                    hunks.push(MergeHunk::Conflict {
262                        base: vec![],
263                        left: left_data_after.to_vec(),
264                        right: right_data_after.to_vec(),
265                    });
266                    hunk = vec![];
267                }
268                left_hunk = left_it.next();
269                right_hunk = right_it.next();
270            }
271            (Some(diff::Result::Right(left_data_after)), None) => {
272                // Left inserted at EOF
273                hunk.append(&mut left_data_after.to_vec());
274                left_hunk = left_it.next();
275            }
276            (None, Some(diff::Result::Right(right_data_after))) => {
277                // Right inserted at EOF
278                hunk.append(&mut right_data_after.to_vec());
279                right_hunk = right_it.next();
280            }
281            _ => {
282                panic!("unhandled merge case: {:?}, {:?}", left_hunk, right_hunk);
283            }
284        }
285    }
286    if hunks.is_empty() {
287        MergeResult::Resolved(hunk)
288    } else {
289        if !hunk.is_empty() {
290            hunks.push(MergeHunk::Resolved(hunk));
291        }
292        MergeResult::Conflict(hunks)
293    }
294}
295
296#[cfg(test)]
297mod tests {
298    use super::*;
299
300    #[test]
301    fn test_merge() {
302        assert_eq!(merge(b"", b"", b""), MergeResult::Resolved(b"".to_vec()));
303        assert_eq!(
304            merge(b"a", b"a", b"a"),
305            MergeResult::Resolved(b"a".to_vec())
306        );
307        assert_eq!(merge(b"a", b"", b"a"), MergeResult::Resolved(b"".to_vec()));
308        assert_eq!(merge(b"a", b"a", b""), MergeResult::Resolved(b"".to_vec()));
309        assert_eq!(merge(b"a", b"", b""), MergeResult::Resolved(b"".to_vec()));
310        assert_eq!(
311            merge(b"a", b"a b", b"a"),
312            MergeResult::Resolved(b"a b".to_vec())
313        );
314        assert_eq!(
315            merge(b"a", b"a", b"a b"),
316            MergeResult::Resolved(b"a b".to_vec())
317        );
318        assert_eq!(
319            merge(b"a", b"a b", b"a c"),
320            MergeResult::Conflict(vec![
321                MergeHunk::Resolved(b"a ".to_vec()),
322                MergeHunk::Conflict {
323                    base: b"".to_vec(),
324                    left: b"b".to_vec(),
325                    right: b"c".to_vec()
326                }
327            ])
328        );
329        assert_eq!(
330            merge(b"a", b"b", b"a"),
331            MergeResult::Resolved(b"b".to_vec())
332        );
333        assert_eq!(
334            merge(b"a", b"a", b"b"),
335            MergeResult::Resolved(b"b".to_vec())
336        );
337        // TODO: It seems like the a->b transition get reported as [Left(a),Right(b)]
338        // instead       of [Both(a,b)], so there is unexpectedly no conflict
339        // here
340        assert_eq!(merge(b"a", b"", b"b"), MergeResult::Resolved(b"b".to_vec()));
341        assert_eq!(merge(b"a", b"b", b""), MergeResult::Resolved(b"b".to_vec()));
342        assert_eq!(
343            merge(b"a", b"b", b"c"),
344            MergeResult::Conflict(vec![MergeHunk::Conflict {
345                base: b"".to_vec(),
346                left: b"b".to_vec(),
347                right: b"c".to_vec()
348            }])
349        );
350    }
351}