1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
//! Txtdist is small utility crate for calculating the distance between two strings.

/*
#![feature(test)]
extern crate test;
*/

use std::ops::{ Index, IndexMut };
use std::collections::BTreeMap;
use std::cmp::min;

// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
struct N2Array<T> {
    y_size: usize,
    buf: Vec<T>
}

impl<T: Clone> N2Array<T> {
    fn new(x: usize, y: usize, value: T) -> N2Array<T> {
        N2Array { y_size: y, buf: vec![value; x * y] }
    }
}

impl<T> Index<(usize, usize)> for N2Array<T> {
    type Output = T;

    #[inline]
    fn index<'a>(&'a self, (x, y): (usize, usize)) -> &'a T {        
        &self.buf[(x * self.y_size) + y]
    }
}

impl<T> IndexMut<(usize, usize)> for N2Array<T> {
    #[inline]
    fn index_mut<'a>(&'a mut self, (x, y): (usize, usize)) -> &'a mut T {
        &mut self.buf[(x * self.y_size) + y]
    }
}

/// Calculate the distance between two strings using the levenshtein algorithm.
/// 
/// > Levenshtein distance is a string metric for measuring the difference between two sequences. 
/// > Informally, the Levenshtein distance between two words is the minimum number of single-character edits 
/// > (i.e. insertions, deletions or substitutions) required to change one word into the other.
/// [wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance)
/// 
/// # Example
///
/// ```rust
/// use txtdist::levenshtein;
///
/// let distance = levenshtein("an act", "a cat");
/// assert_eq!(distance, 3)
/// ```
pub fn levenshtein(source: &str, target: &str) -> u32 {
    let (n, m) = (source.len(), target.len());

    let mut matrix = N2Array::new(n+1, m+1, 0);

    for i in 1..n+1 {
        matrix[(i, 0)] = i;
    }

    for i in 1..m+1 {
        matrix[(0, i)] = i;
    }

    for (row, char_s) in source.chars().enumerate() {
        let row = row + 1;
        
        for (col, char_t) in target.chars().enumerate() {
            let col = col + 1;
            if char_s == char_t {
                matrix[(row, col)] = matrix[(row-1, col-1)];
            } else {
                matrix[(row, col)] = min(matrix[(row-1, col)]   + 1, 
                                     min(matrix[(row, col-1)]   + 1,
                                         matrix[(row-1, col-1)] + 1));
            }
        }
    }

    matrix[(n, m)] as u32
}

/// Calculate the distance between two strings using the damerau levenshtein algorithm. 
/// 
/// > The Damerau–Levenshtein distance (named after Frederick J. Damerau and Vladimir I. Levenshtein) 
/// > is a distance (string metric) between two strings, i.e., finite sequence of symbols, 
/// > given by counting the minimum number of operations needed to transform one string into the other, 
/// > where an operation is defined as an insertion, deletion, or substitution of a single character, 
/// > or a transposition of two adjacent characters. 
/// [wikipedia](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
/// 
/// # Example
///
/// ```rust
/// use txtdist::damerau_levenshtein;
///
/// let distance = damerau_levenshtein("an act", "a cat");
/// assert_eq!(distance, 2)
/// ```
pub fn damerau_levenshtein(source: &str, target: &str) -> u32 {
    let (n, m) = (source.len(), target.len());

    if n == 0 { return m as u32; }
    if m == 0 { return n as u32; }

    if n == m && source == target {
        return 0;
    }
        
    let inf = n + m;
    let mut matrix = N2Array::new(n+2, m+2, 0);

    matrix[(0, 0)] = inf;
    for i in 0..n+1 {
        matrix[(i+1, 0)] = inf;
        matrix[(i+1, 1)] = i;
    }
    for j in 0..m+1 {
        matrix[(0, j+1)] = inf;
        matrix[(1, j+1)] = j;
    };

    let mut last_row = BTreeMap::new();

    for (row, char_s) in source.chars().enumerate() {
        let mut last_match_col = 0;
        let row = row + 1;
        
        for (col, char_t) in target.chars().enumerate() {
            let col = col + 1;
            let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
            let cost = if char_s == char_t { 0 } else { 1 };

            let dist_add = matrix[(row, col+1)] + 1;
            let dist_del = matrix[(row+1, col)] + 1;
            let dist_sub = matrix[(row, col)] + cost; 
            let dist_trans = matrix[(last_match_row, last_match_col)]
                            + (row - last_match_row - 1) + 1
                            + (col - last_match_col - 1);

            let dist = min(min(dist_add, dist_del), 
                           min(dist_sub, dist_trans));

            matrix[(row+1, col+1)] = dist;
            
            if cost == 0 {
                last_match_col = col;
            }
        }

        last_row.insert(char_s.clone(), row);
    }

    matrix[(n+1, m+1)] as u32
}

#[cfg(test)]
mod tests {
    use super::*;
    //use test::Bencher;

    #[test]
    fn test_levenschtein() {
        let distance = levenshtein("kitten", "sitting");
        assert_eq!(distance, 3);

        let distance = levenshtein("saturday", "sunday");
        assert_eq!(distance, 3);

        let distance = levenshtein("an act", "a cat");
        assert_eq!(distance, 3);

        let distance = levenshtein("", "AA");
        assert_eq!(distance, 2);

        let distance = levenshtein("string", "string");
        assert_eq!(distance, 0);
    }

    #[test]
    fn test_damerau_levenschtein() {
        let distance = damerau_levenshtein("CA", "ABC");
        assert_eq!(distance, 2);

        let distance = damerau_levenshtein("an act", "a cat");
        assert_eq!(distance, 2);

        let distance = damerau_levenshtein("", "a cat");
        assert_eq!(distance, 5);

        let distance = damerau_levenshtein("MERCEDES-BENS", "MERCEDES-BENZ");
        assert_eq!(distance, 1);

        let distance = damerau_levenshtein("", "");
        assert_eq!(distance, 0);

        let distance = damerau_levenshtein("some string", "some string");
        assert_eq!(distance, 0);
    }
    /*
    #[bench]
    fn bench_damerau_levenschtein(b: &mut Bencher) {
        b.iter(|| damerau_levenshtein("one string", "other string"));
    }

    #[bench]
    fn bench_damerau_levenschtein_same_length(b: &mut Bencher) {
        b.iter(|| damerau_levenshtein("one string12", "other string"));
    }

    #[bench]
    fn bench_damerau_levenschtein_empty(b: &mut Bencher) {
        b.iter(|| damerau_levenshtein("", "other string"));
    }

    #[bench]
    fn bench_damerau_levenschtein_same(b: &mut Bencher) {
        b.iter(|| damerau_levenshtein("some string", "some string"));
    }

    #[bench]
    fn bench_damerau_levenschtein_long(b: &mut Bencher) {
        let text1 = r"
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
            Curabitur in maximus lectus. Nulla ornare metus sit amet congue feugiat. 
            Nunc sit amet mollis lectus. Integer vel mollis lacus. Nullam molestie justo dui, 
            vitae vulputate augue molestie nec. Sed non urna at augue aliquet feugiat eu ut diam. 
            Nam dignissim semper est, et dignissim lectus. Curabitur luctus mattis mauris. Lorem ipsum dolor sit amet, 
            consectetur adipiscing elit. Morbi at neque a leo mollis bibendum.

            Pellentesque mattis lacus et arcu fermentum, non volutpat orci tincidunt. 
            Nam et blandit magna. Vivamus dignissim in nisi at fringilla. 
            Donec pretium justo justo, vel placerat urna ultricies sit amet. Sed erat felis, 
            commodo non laoreet in, aliquet a lacus. Curabitur condimentum enim elit. 
            Curabitur tincidunt ligula ut quam rutrum fermentum. Donec laoreet mattis porttitor. 
            Etiam ornare urna in congue vehicula. Nam non metus sapien. 
            Morbi vel turpis volutpat, hendrerit augue sed, mattis est. 
            Sed auctor nibh lorem, vitae sagittis nibh egestas eget.
        ";

        let text2 = r"
            Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
            Curabitur in maximus lectus. Nulla ornare metus sit amet congue feugiat. 
            Nunc sit amet mollis lectus. Integer vel mollis lacus. Nullam molestie justo dui, 
            vitae vulputate augue molestie nec. Sed non urna at augue aliquet feugiat eu ut diam. 
            Nam dignissim semper est, et eignissim lectus. Curabitur luctus mattis mauris. Lorem ipsum dolor sit amet, 
            consectetur adipiscing elit. Morbi at neque a leo mollis bibendum.

            Pellentesque mattis lacus et arcu fermentum, non volutpat orci tincidunt. 
            Nam et blandit magna. Vivamus dignmssim in nisi at fringilla. 
            Donec pretium justo justo, vel placerat urna ultricies sit amet. Sed erat felis, 
            commodo non laoreet in, aliquet a lacus. Curabitur condimentum enim elit. 
            Curabitur tincidunt ligula ut quam rutrum fermentum. Donec laoreet mattis porttitor.             
        ";

        b.iter(|| damerau_levenshtein(text1, text2))
    }*/
}