1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
//! Used for finding the minimal set of operations to transform one string into another.
//!
//! The primary function of this module is [find diff](fn.find_diff.html).
use std::mem;
use std::cmp::max;
use super::{Diff};


/// Finds the difference on a character by character level between two strings
///
/// Uses the Hirschberg algorithm (doi: [10.1145/360825.360861](http://dx.doi.org/10.1145/360825.360861))
/// which operates in `O(x * y)` time and `O(y)` space.  The algorithm finds the minimal set of operations
/// that will transform 'old' into 'new'.  The 'weight' of each operation is determined by the `scorer.`
/// For more details about weighting, see the [OperationScore](trait.OperationScore.html) documentation.
///
/// The operations in the returned `Diff `are presented in file order, with offsets assuming the
/// previous operations have already been performed.  Furthermore, the inserts are assumed to
/// be performed prior to the deletes.
///
/// # Example
///
/// ```
/// use rdiff::string_diff::{find_diff, EditDistance};
/// // Find the difference between meadow and yellowing using the edit distance as the weighting.
/// let diff = find_diff("meadow", "yellowing", &EditDistance{});
/// // prints (0, 'y'), (3, 'll') and (9, 'ing')
/// for insert in diff.inserts() {
///     println!("{:?}", insert);
/// }
/// // prints (1, 1) and (4, 2)
/// for delete in diff.deletes() {
///     println!("{:?}", delete);
/// }
/// assert_eq!("yellowing", diff.apply_to_string("meadow").unwrap());
/// ```
pub fn find_diff<S: OperationScore>(old: &str, new: &str, scorer: &S) -> Diff {
    let mut diff = Diff::new();
    let mut insert_index = 0;
    let mut delete_index = 0;
    let old_rev = old.chars().rev().collect::<String>();
    let new_rev = new.chars().rev().collect::<String>();
    hirschberg(old, new, &old_rev, &new_rev, scorer, &mut diff, &mut insert_index, &mut delete_index);
    diff
}

/// Handles updating the diff and relevant indexes when inserting a string
/// Needed because the string must be converted to bytes before it can be used in the diff
macro_rules! do_insert {
    ($s: expr, $index: expr, $diff: expr) => (
        {
            let bytes = $s.bytes().collect::<Vec<_> >();
            let byte_len = bytes.len();
            $diff.add_insert(*$index, bytes);
            *$index += byte_len;
        }
    );
}

/// Handles updating the diff and relevant indexes when deleting a suvstring
/// Needed because the string must be converted to bytes before it can be used in the diff
macro_rules! do_delete {
    ($length: expr, $delete_index: expr, $insert_index: expr, $diff: expr) => (
        {
            $diff.add_delete(*$insert_index - *$delete_index, $length);
            *$delete_index += $length;
            *$insert_index += $length;
        }
    );
}

/// Uses the Hirschberg algorithm to calculate the optimal set of operations to transform 'old' into 'new'.
/// The only parameters that are input are 'old', 'new' and `scorer`.  `x_rev` and `y_rev` are just
/// cached so that 'old' and 'new' don't need to be reversed for every recursion of the algorithm.
/// `diff` is the output of the algorithm and `insert_index` and `delete_index` are simply intermediate state
/// being passed around.
fn hirschberg<S: OperationScore>(old: &str, new: &str, old_rev: &str, new_rev: &str, scorer: &S, diff: &mut Diff, insert_index: &mut usize, delete_index: &mut usize) {
    trace!("'{}' ({}) '{}' ({})", old, old_rev, new, new_rev);
    // We're going to use these lengths over and over again, we might as well cache them.
    let old_len = old.len();
    let new_len = new.len();

    // If one of the two strings is 0, then it's trvial to transform one into the other
    if old_len == 0 {
        do_insert!(new, insert_index, diff);
    } else if new_len == 0 {
        do_delete!(old_len, delete_index, insert_index, diff);
    }
    // If old is legnth 1, then there are two cases:
    else if old_len == 1 {
        let old_char = old.chars().next().unwrap();
        match new.chars().position(|c| c == old_char) {
            // Either new contains old, in which case
            Some(position) => {
                // We insert whatever is on the left of old in new
                if position > 0 {
                    do_insert!(new[..position], insert_index, diff);
                }
                *insert_index += 1;
                // and we insert whatever is on the right of old in new
                if new_len - position > 1 {
                    do_insert!(new[position + 1..], insert_index, diff);
                }
            } None => {
                //or new does not contain old, in which case
                // we simply delete old and insert new
                do_insert!(new, insert_index, diff);
                do_delete!(1, delete_index, insert_index, diff);
            }
        }
    }
    // If new is length 1, then there are two cases:
    else if new_len == 1 {
        let new_char = new.chars().next().unwrap();
        match old.chars().position(|c| c == new_char) {
            // either old contains new, in which case
            Some(position) => {
                // We delete everything in old to the left of new
                if position > 0 {
                    do_delete!(position, delete_index, insert_index, diff);
                }
                *insert_index += 1;
                // and we delete everything in old to the right of new
                if old_len - position > 1 {
                    let delete_len = old_len - position - 1;
                    do_delete!(delete_len, delete_index, insert_index, diff);
                }
            } None => {
                // or old does not contain new, in which case we simply insert new and delete
                // everything that was previously in old
                do_insert!(new, insert_index, diff);
                do_delete!(old_len, delete_index, insert_index, diff);
            }
        }
    } else {
        // If it's not trivial, then we recurse until it is.
        // We begin bnew dividing old in half.
        let old_mid = old_len / 2;
        // We then find the index in new where splitting the string will give us the
        // highest possible score.  This index is the point where the trace of the edit
        // operations performed is guaranteed to cross.
        let score_l = nw_score(&old[..old_mid], new, scorer);
        let score_r = nw_score(&old_rev[..old_len - old_mid], new_rev, scorer);
        let new_mid = score_l.iter()
                            .zip(score_r.iter().rev())
                            .map(|(l, r)| l + r)
                            .zip(0..new_len + 1).max().unwrap().1;
        // We then recurse on the left side of old and new
        hirschberg(&old[..old_mid], &new[..new_mid], &old_rev[old_len - old_mid..], &new_rev[new_len - new_mid..], scorer, diff, insert_index, delete_index);
        // and the right side of old and new
        hirschberg(&old[old_mid..], &new[new_mid..], &old_rev[..old_len - old_mid], &new_rev[..new_len - new_mid], scorer, diff, insert_index, delete_index);


    }

}

/// Used to calculate the score for each operation that
/// will be performed.  The score can be static, or it can
/// vary based on which character is being deleted inserted or substituted.
/// It is highly recommended to inline the implementation of these characters
pub trait OperationScore {
    /// The score for inserting character `c` into the string
    fn insert_score(&self, c: char) -> i32;
    /// The score for deleting character `c` from the string
    fn delete_score(&self, c: char) -> i32;
    /// The score for replacing character `old` with character `new`
    fn substitution_score(&self, old: char, new: char) -> i32;
    /// The score for when a character is one string matches the character in the other string
    fn match_score(&self, c: char) -> i32;
}

/// Used as the classiscal definition of edit distance.
///
/// That is:
///
/// * Insert is cost -1
/// * Delete is cost -1
/// * Substitution is cost -2 (an insert + a delete)
/// * Matching is cost 0
pub struct EditDistance;

impl OperationScore for EditDistance {
    #[inline]
    fn insert_score(&self, _: char) -> i32 {
        -1
    }

    #[inline]
    fn delete_score(&self, _: char) -> i32 {
        -1
    }

    #[inline]
    fn substitution_score(&self, _: char, _: char) -> i32 {
        -2
    }

    #[inline]
    fn match_score(&self, _: char) -> i32 {
        0
    }
}

/// Calculate the score based on the Needleman-Wunsch algorithm.  This algorithm
/// calculates the cost of transforming string 'old' into string 'new' using operation scoring
/// given by `scorer`.
///
/// It operates by iteratively generating the score for progressively longer
/// substrings of 'old' and 'new'.  The result is a vector of the transformation score
/// from 'old' to a substring of length `i` of 'new' where `i` is the index of an element in
/// the resulting vector.
fn nw_score<S: OperationScore>(old: &str, new: &str, scorer: &S) -> Vec<i32> {

    trace!("nw_score for '{}' - '{}'", old, new);
    let row_len = new.len() + 1;
    let mut last_row = Vec::with_capacity(row_len);
    let mut this_row = Vec::with_capacity(row_len);
    let mut total_insert = 0;
    last_row.push(0);
    for new_char in new.chars() {
        total_insert += scorer.insert_score(new_char);
        last_row.push(total_insert);
    }
    trace!("{:?}", last_row);
    for old_char in old.chars() {
        this_row.push(last_row[0] + scorer.delete_score(old_char));
        for (new_index, new_char) in new.chars().enumerate() {
            let score_sub = last_row[new_index] + if old_char == new_char {
                scorer.match_score(old_char)
            } else {
                scorer.substitution_score(old_char, new_char)
            };
            let score_del = last_row[new_index + 1] + scorer.delete_score(old_char);
            let score_ins = this_row[new_index] + scorer.insert_score(new_char);
            this_row.push(max(max(score_sub, score_del), score_ins))
        }
        trace!("{:?}", this_row);
        last_row = mem::replace(&mut this_row, Vec::with_capacity(row_len));
    }
    last_row

}

#[cfg(test)]
mod test {
    extern crate env_logger;
    use super::{nw_score, find_diff, EditDistance, OperationScore};
    use super::super::{Insert, Delete, Diff};

    struct ExampleScores;

    macro_rules! check_diff {
        ($start: tt |  $new: tt | $scorer: tt | $(($insert_pos : tt, $insert_value: tt)),* | $(($delete_pos: tt, $delete_len: tt)),*) => {
            {
                check_diff_workaround!($start; $new; $scorer; $(($insert_pos, $insert_value)),*; $(($delete_pos, $delete_len)),*)
            }
        };
    }

    // Caused by a bug in the implementation of the tt macro type.  It currently has to be passed as an expr into another macro
    // or it throws a fit for no reason.  See https://github.com/rust-lang/rust/issues/5846
    macro_rules! check_diff_workaround {
        ($start: expr ; $new: expr ; $scorer: expr; $(($insert_pos : tt, $insert_value: tt)),* ; $(($delete_pos: tt, $delete_len: tt)),*) => {
            {
                let diff = find_diff($start, $new, &$scorer);
                assert_eq!(Diff {
                    inserts: vec![$(Insert{position: $insert_pos, data: $insert_value.bytes().collect()}),*],
                    deletes: vec![$(Delete{position: $delete_pos, len: $delete_len}),*]
                }, diff);
                assert_eq!(diff.apply_to_string($start).unwrap(), $new.to_string());
            }
        };
    }

    // From the wikipedia example at https://en.wikipedia.org/wiki/Hirschberg%27s_algorithm
    impl OperationScore for ExampleScores {
        #[inline]
        fn insert_score(&self, _: char) -> i32 {
            -2
        }

        #[inline]
        fn delete_score(&self, _: char) -> i32 {
            -2
        }

        #[inline]
        fn substitution_score(&self, _: char, _: char) -> i32 {
            -1
        }

        #[inline]
        fn match_score(&self, _: char) -> i32 {
            2
        }
    }

    #[test]
    fn score() {
        assert_eq!(nw_score("ACGC", "CGTAT", &EditDistance{}), vec![-4, -3, -2, -3, -4, -5]);
        assert_eq!(nw_score("AGTA", "TATGC", &EditDistance{}), vec![-4, -3, -2, -3, -4, -5]);

        assert_eq!(nw_score("ACGC", "CGTAT", &ExampleScores{}), vec![-8, -4, 0, 1, -1, -3]);
        assert_eq!(nw_score("AGTA", "TATGC", &ExampleScores{}), vec![-8, -4, 0, -2, -1, -3]);
    }

    #[test]
    fn do_find_diff() {
        //env_logger::init().unwrap();
        check_diff!(
            "kitten" |
            "kettle" |
            EditDistance |
            (1, "e"), (5, "l") |
            (2, 1), (6, 1)
        );
        check_diff!(
            "meadow" |
            "yellowing" |
            EditDistance |
            (0, "y"), (3, "ll"), (9, "ing") |
            (1, 1), (4, 2)
        );

        check_diff!(" I've" |
                    " I" |
                    EditDistance |
                    |
                    (2, 3)
                );

        check_diff!(" I've got a new place" |
                    " I found a new place" |
                    EditDistance |
                    (6, "f"), (9, "und") |
                    (2, 3), (4, 1), (8, 1)
                );
        check_diff!(
            "Since my baby left me I've got a new place to dwell\nI walk down a lonely street to Heartbreak Hotel." |
            "Since my baby left me I found a new place to dwell\nDown at the end of 'Lonely Street' to 'Heartbreak Hotel.'" |
            EditDistance |
            (27, "f"), (30, "und"), (56, "Down"), (64, "t the"), (72, "en"), (75, " "), (77, "f"), (81, "'L"), (92, "S"), (99, "'"),  (104, "'"), (122, "'") |
            (23, 3), (25, 1), (29, 1),(55, 1), (56, 1), (62, 2), (69, 2), (72, 3), (79, 1)
        );
    }
}