diff_match_patch_rs/
traits.rs

1use std::hash::Hash;
2
3use percent_encoding::{percent_decode, AsciiSet, CONTROLS};
4
5use crate::{
6    dmp::{Diff, DiffMatchPatch, Time},
7    Ops,
8};
9
10pub type Efficient = u8;
11pub type Compat = char;
12
13// Appending controls to ensure exact same encoding as cpp variant
14const ENCODE_SET: &AsciiSet = &CONTROLS
15    .add(b'"')
16    .add(b'<')
17    .add(b'>')
18    .add(b'`')
19    .add(b'{')
20    .add(b'}')
21    .add(b'%')
22    .add(b'[')
23    .add(b'\\')
24    .add(b']')
25    .add(b'^')
26    .add(b'|');
27
28pub trait DType: Copy + Ord + Eq + Hash {
29    fn bisect_split(
30        dmp: &DiffMatchPatch,
31        old: &[Self],
32        new: &[Self],
33        x: usize,
34        y: usize,
35        deadline: Option<Time>,
36    ) -> Result<Vec<Diff<Self>>, crate::errors::Error> {
37        let (old_a, new_a, old_b, new_b) = if x <= old.len() && y <= new.len() {
38            (&old[..x], &new[..y], &old[x..], &new[y..])
39        } else {
40            return Err(crate::errors::Error::InvalidInput);
41        };
42
43        // Compute both diffs serially.
44        let mut diffs_a = dmp.diff_internal(old_a, new_a, false, deadline)?;
45        diffs_a.append(&mut dmp.diff_internal(old_b, new_b, false, deadline)?);
46
47        Ok(diffs_a)
48    }
49
50    fn from_char(c: char) -> Self;
51    fn as_char(&self) -> Option<char>;
52    fn from_str(str: &str) -> Vec<Self>;
53    fn to_string(data: &[Self]) -> Result<String, crate::Error>;
54
55    fn is_whitespace(self) -> bool {
56        unimplemented!()
57    }
58    fn is_newline(self) -> bool {
59        unimplemented!()
60    }
61    fn is_carriage(self) -> bool {
62        unimplemented!()
63    }
64    fn is_alphanum(self) -> bool {
65        unimplemented!()
66    }
67
68    fn is_linebreak_end(input: &[Self]) -> bool;
69    fn is_linebreak_start(input: &[Self]) -> bool;
70
71    fn percent_encode(input: &[Self]) -> Vec<Self>;
72    fn percent_decode(input: &[Self]) -> Vec<Self>;
73
74    fn humanize(_diffs: &mut Vec<Diff<Self>>) -> Result<(), crate::Error> {
75        Ok(())
76    }
77}
78
79impl DType for u8 {
80    fn from_char(c: char) -> Self {
81        c as u8
82    }
83
84    fn as_char(&self) -> Option<char> {
85        Some(*self as char)
86    }
87
88    fn from_str(str: &str) -> Vec<Self> {
89        str.as_bytes().to_vec()
90    }
91
92    fn to_string(data: &[Self]) -> Result<String, crate::Error> {
93        std::str::from_utf8(data)
94            .map_err(|_| crate::Error::Utf8Error)
95            .map(|s| s.to_string())
96    }
97
98    fn is_whitespace(self) -> bool {
99        char::is_whitespace(self.into())
100    }
101
102    fn is_newline(self) -> bool {
103        char::is_newline(self.into())
104    }
105
106    fn is_carriage(self) -> bool {
107        self == b'\r'
108    }
109
110    fn is_alphanum(self) -> bool {
111        char::is_alphanumeric(self.into())
112    }
113
114    fn is_linebreak_end(input: &[Self]) -> bool {
115        input.ends_with(b"\n\n") || input.ends_with(b"\n\r\n")
116    }
117
118    fn is_linebreak_start(input: &[Self]) -> bool {
119        input.starts_with(b"\r\n\n")
120            || input.starts_with(b"\r\n\r\n")
121            || input.starts_with(b"\n\r\n")
122            || input.starts_with(b"\n\n")
123    }
124
125    fn percent_encode(input: &[Self]) -> Vec<Self> {
126        percent_encoding::percent_encode(input, ENCODE_SET)
127            .collect::<String>()
128            .as_bytes()
129            .to_vec()
130    }
131
132    fn percent_decode(input: &[Self]) -> Vec<Self> {
133        percent_decode(input).collect()
134    }
135
136    fn humanize(diffs: &mut Vec<Diff<Self>>) -> Result<(), crate::Error> {
137        let mut idx = 0_usize;
138        let mut err_prefix = vec![];
139
140        let mut err_start = None;
141
142        // First pass, we'll chomp of errors in the diffs?
143        // The pattern we have seen is that
144        while idx < diffs.len() {
145            let diff = &mut diffs[idx];
146
147            if let Err(e) = std::str::from_utf8(diff.data()) {
148                // Errors can come in 2 forms
149                // 1. error at the end of bytes - we'll keep prefixing the error bytes to all non equalities that follow
150                // 2. error at the begining of bytes - this one is tricky - we'll need to figure out the suffix at which the rest of the string is valid
151                if e.error_len().is_none() && err_start.is_none() {
152                    err_start = Some(idx);
153
154                    if diff.op() == Ops::Equal {
155                        err_prefix = diff.data()[e.valid_up_to()..].to_vec();
156                        diff.1 = if e.valid_up_to() > 0 {
157                            diff.data()[..e.valid_up_to()].to_vec()
158                        } else {
159                            vec![]
160                        };
161
162                        idx += 1;
163                        continue;
164                    }
165                }
166
167                if let Some(err_start_idx) = err_start {
168                    // For insert and delete add the prefix collected earlier (end error bytes)
169                    if diff.op() == Ops::Delete || diff.op() == Ops::Insert {
170                        diff.1 = [&err_prefix, diff.data()].concat();
171                    } else {
172                        if let Some(err_len) = e.error_len() {
173                            // Iteratively figure out at what point does the error go away if at-all
174                            let mut suffix = diff.data()[..err_len].to_vec();
175                            let mut data = diff.data()[err_len..].to_vec();
176
177                            while let Err(e) = std::str::from_utf8(&data) {
178                                if e.error_len().is_none() {
179                                    break;
180                                }
181
182                                // should never panic cos empty data is also a valid utf8
183                                let first_byte = data.remove(0);
184                                suffix.push(first_byte);
185                            }
186
187                            // here, we have a suffix to be added to all previous cases and a data that might be good string or error at the end of bytes
188                            // which is a separate cycle
189
190                            // Let's add the suffix to all the intermediate steps
191                            diff.1 = data.to_vec();
192                            diffs
193                                .iter_mut()
194                                .take(idx)
195                                .skip(err_start_idx)
196                                .for_each(|d| {
197                                    if d.op() == Ops::Equal {
198                                        return;
199                                    }
200                                    d.1 = [d.data(), &suffix[..]].concat();
201                                });
202
203                            // An equality within edits, lets seek the next one and update this suffix too
204                            if data.is_empty() {
205                                if idx < diffs.len() - 1 && diffs[idx + 1].op() != Ops::Equal {
206                                    diffs[idx + 1].1 =
207                                        [&err_prefix[..], &suffix, diffs[idx + 1].data()].concat();
208                                }
209
210                                diffs.remove(idx);
211                            }
212                        }
213
214                        // Move back to where all of this started
215                        idx = err_start_idx;
216                        err_start = None;
217                        err_prefix = vec![];
218                        continue;
219                    }
220                }
221            }
222            idx += 1;
223        }
224
225        Ok(())
226    }
227}
228
229impl DType for char {
230    fn from_char(c: char) -> Self {
231        c
232    }
233
234    fn as_char(&self) -> Option<char> {
235        Some(*self)
236    }
237
238    fn from_str(str: &str) -> Vec<Self> {
239        str.chars().collect::<Vec<_>>()
240    }
241
242    fn to_string(data: &[Self]) -> Result<String, crate::Error> {
243        Ok(data.iter().collect::<String>())
244    }
245
246    fn is_whitespace(self) -> bool {
247        char::is_whitespace(self)
248    }
249
250    fn is_newline(self) -> bool {
251        self == '\n'
252    }
253
254    fn is_carriage(self) -> bool {
255        self == '\r'
256    }
257
258    fn is_alphanum(self) -> bool {
259        self.is_alphanumeric()
260    }
261
262    fn is_linebreak_end(input: &[Self]) -> bool {
263        input.ends_with(&['\n', '\n']) || input.ends_with(&['\n', '\r', '\n'])
264    }
265
266    fn is_linebreak_start(input: &[Self]) -> bool {
267        input.starts_with(&['\r', '\n', '\n'])
268            || input.starts_with(&['\r', '\n', '\r', '\n'])
269            || input.starts_with(&['\n', '\r', '\n'])
270            || input.starts_with(&['\n', '\n'])
271    }
272
273    fn percent_encode(input: &[Self]) -> Vec<Self> {
274        let d = input
275            .iter()
276            .map(|c| {
277                let mut b = vec![0; c.len_utf8()];
278                c.encode_utf8(&mut b);
279
280                b
281            })
282            .collect::<Vec<_>>()
283            .concat();
284
285        let encoded = percent_encoding::percent_encode(&d[..], ENCODE_SET).collect::<String>();
286
287        Self::from_str(&encoded)
288    }
289
290    fn percent_decode(input: &[Self]) -> Vec<Self> {
291        let ip = input.iter().collect::<String>();
292        percent_decode(ip.as_bytes())
293            .decode_utf8()
294            .unwrap()
295            .chars()
296            .collect()
297    }
298}
299
300impl DType for usize {
301    fn bisect_split(
302        dmp: &DiffMatchPatch,
303        old: &[usize],
304        new: &[usize],
305        x: usize,
306        y: usize,
307        deadline: Option<Time>,
308    ) -> Result<Vec<Diff<usize>>, crate::errors::Error> {
309        let (old_a, new_a, old_b, new_b) = if x <= old.len() && y <= new.len() {
310            (&old[..x], &new[..y], &old[x..], &new[y..])
311        } else {
312            return Err(crate::errors::Error::InvalidInput);
313        };
314
315        // Compute both diffs serially.
316        let mut diffs_a = dmp.diff_lines(old_a, new_a, deadline)?;
317        diffs_a.append(&mut dmp.diff_lines(old_b, new_b, deadline)?);
318
319        Ok(diffs_a)
320    }
321
322    fn from_char(c: char) -> Self {
323        (c as u8) as usize
324    }
325
326    fn as_char(&self) -> Option<char> {
327        char::from_digit(*self as u32, 10)
328    }
329
330    fn from_str(_: &str) -> Vec<Self> {
331        unimplemented!()
332    }
333
334    fn to_string(_: &[Self]) -> Result<String, crate::Error> {
335        unimplemented!()
336    }
337
338    fn is_linebreak_end(_: &[Self]) -> bool {
339        unimplemented!()
340    }
341
342    fn is_linebreak_start(_: &[Self]) -> bool {
343        unimplemented!()
344    }
345
346    fn percent_encode(_: &[Self]) -> Vec<Self> {
347        unimplemented!()
348    }
349
350    fn percent_decode(_: &[Self]) -> Vec<Self> {
351        unimplemented!()
352    }
353}