seqdiff/
lib.rs

1//! Functions to get correspondence between two sequences like diff,
2//! based on Myers' algorithm.
3#[cfg(test)]
4mod tests;
5use std::cmp::{max, min};
6#[cfg(test)]
7extern crate quickcheck;
8#[cfg(test)]
9#[macro_use(quickcheck)]
10extern crate quickcheck_macros;
11use std::isize::{MAX, MIN};
12
13/// An alias for the result of diff type
14pub type Diff = Vec<Option<usize>>;
15
16struct Difference<'a, X, Y> {
17    xv: &'a [X],
18    yv: &'a [Y],
19
20    // working memory for forward path
21    vf: Vec<isize>,
22    // working memory for backward path
23    vb: Vec<isize>,
24    offset_d: isize,
25
26    // edit script for xv
27    xe: Vec<Option<usize>>,
28    // edit script for yv
29    ye: Vec<Option<usize>>,
30}
31
32impl<'a, X, Y> Difference<'a, X, Y>
33where
34    X: PartialEq<Y>,
35{
36    fn new(xv: &'a [X], yv: &'a [Y]) -> Self {
37        let dmax = xv.len() + yv.len() + 1;
38        let offset_d = yv.len() as isize;
39        let vf = vec![MIN; dmax];
40        let vb = vec![MAX; dmax];
41        let xe = vec![None; xv.len()];
42        let ye = vec![None; yv.len()];
43        Self {
44            xv,
45            yv,
46            vf,
47            vb,
48            offset_d,
49            xe,
50            ye,
51        }
52    }
53
54    fn diff(&mut self) -> usize {
55        self.diff_part((0, self.xv.len()), (0, self.yv.len()))
56    }
57
58    fn diff_part(
59        &mut self,
60        (mut xl, mut xr): (usize, usize),
61        (mut yl, mut yr): (usize, usize),
62    ) -> usize {
63        // shrink by equality
64        while xl < xr && yl < yr && self.xv[xl] == self.yv[yl] {
65            self.xe[xl] = Some(yl);
66            self.ye[yl] = Some(xl);
67            xl += 1;
68            yl += 1;
69        }
70        // same as backward
71        while xl < xr && yl < yr && self.xv[xr - 1] == self.yv[yr - 1] {
72            xr -= 1;
73            yr -= 1;
74            self.xe[xr] = Some(yr);
75            self.ye[yr] = Some(xr);
76        }
77
78        // process simple case
79        if xl == xr {
80            self.ye[yl..yr].iter_mut().for_each(|x| *x = None);
81            yr - yl
82        } else if yl == yr {
83            self.xe[xl..xr].iter_mut().for_each(|x| *x = None);
84            xr - xl
85
86        // divide and conquer
87        } else {
88            let (d, (xm, ym)) = self.find_mid((xl, xr), (yl, yr));
89            self.diff_part((xl, xm), (yl, ym));
90            self.diff_part((xm, xr), (ym, yr));
91            d
92        }
93    }
94
95    #[allow(clippy::many_single_char_names)]
96    fn find_mid(
97        &mut self,
98        (xl, xr): (usize, usize),
99        (yl, yr): (usize, usize),
100    ) -> (usize, (usize, usize)) {
101        let xl = xl as isize;
102        let xr = xr as isize;
103        let yl = yl as isize;
104        let yr = yr as isize;
105
106        let kmin = xl - yr;
107        let kmax = xr - yl;
108        let kmidf = xl - yl; // center diag in this fragment for forwad snake
109        let kmidb = xr - yr;
110        let delta = (xr - xl) - (yr - yl);
111        let is_odd = (delta & 1) == 1;
112
113        // convert k to index of working memory (vf, vb)
114        let ktoi = {
115            let offset = self.offset_d;
116            move |k: isize| -> usize { (k + offset) as usize }
117        };
118
119        self.vf[ktoi(kmidf)] = xl;
120        self.vb[ktoi(kmidb)] = xr;
121
122        let mut kminf = kmidf;
123        let mut kmaxf = kmidf;
124        let mut kminb = kmidb;
125        let mut kmaxb = kmidb;
126
127        let gety = |x: isize, k: isize| x.saturating_sub(k);
128
129        for d in 1i64.. {
130            // We don't have to check the case `d == 0` because it is handled in `fn diff_part`
131
132            // forward
133            {
134                // update range
135                if kminf > kmin {
136                    kminf -= 1;
137                    if let Some(x) = self.vf.get_mut(ktoi(kminf - 1)) {
138                        *x = MIN;
139                    }
140                } else {
141                    kminf += 1;
142                }
143                if kmaxf < kmax {
144                    kmaxf += 1;
145                    if let Some(x) = self.vf.get_mut(ktoi(kmaxf + 1)) {
146                        *x = MIN;
147                    }
148                } else {
149                    kmaxf -= 1
150                }
151
152                for k in (kminf..=kmaxf).rev().step_by(2) {
153                    let ik = ktoi(k);
154                    let x = {
155                        let lo = self.vf.get(ktoi(k - 1)).cloned();
156                        let hi = self.vf.get(ktoi(k + 1)).cloned();
157                        max(lo.map(|x| x + 1), hi).unwrap()
158                    };
159                    let y = gety(x, k);
160                    if !(xl <= x && x <= xr && yl <= y && y <= yr) {
161                        continue;
162                    }
163
164                    // go forward in diagonal path
165                    let (u, v) = {
166                        let mut u = x;
167                        let mut v = y;
168                        let len = self.xv[u as usize..xr as usize]
169                            .iter()
170                            .zip(self.yv[v as usize..yr as usize].iter())
171                            .take_while(|(x, y)| x == y)
172                            .count() as isize;
173                        u += len;
174                        v += len;
175                        (u, v)
176                    };
177
178                    debug_assert!(xl <= u && u <= xr);
179                    debug_assert!(yl <= v && v <= yr);
180
181                    self.vf[ik] = u;
182                    if is_odd && kminb <= k && k <= kmaxb && self.vb[ik] <= u {
183                        return (2 * d as usize - 1, (x as usize, y as usize));
184                    }
185                }
186            }
187
188            // backward
189            {
190                // update range
191                if kminb > kmin {
192                    kminb -= 1;
193                    if let Some(x) = self.vb.get_mut(ktoi(kminb - 1)) {
194                        *x = MAX;
195                    }
196                } else {
197                    kminb += 1;
198                }
199                if kmaxb < kmax {
200                    kmaxb += 1;
201                    if let Some(x) = self.vb.get_mut(ktoi(kmaxb + 1)) {
202                        *x = MAX;
203                    }
204                } else {
205                    kmaxb -= 1
206                }
207
208                for k in (kminb..=kmaxb).rev().step_by(2) {
209                    let x = {
210                        let lo = self.vb.get(ktoi(k - 1)).cloned();
211                        let hi = self.vb.get(ktoi(k + 1)).cloned();
212                        match (lo, hi.map(|x| x - 1)) {
213                            (Some(lo), Some(hi)) => min(lo, hi),
214                            (Some(lo), _) => lo,
215                            (_, Some(hi)) => hi,
216                            _ => unreachable!(),
217                        }
218                    };
219                    let y = gety(x, k);
220                    if !(xl <= x && x <= xr && yl <= y && y <= yr) {
221                        continue;
222                    }
223
224                    // go backward in diagonal path
225                    let (u, v) = {
226                        let mut u = x;
227                        let mut v = y;
228                        let len = self.xv[xl as usize..u as usize]
229                            .iter()
230                            .rev()
231                            .zip(self.yv[yl as usize..v as usize].iter().rev())
232                            .take_while(|(x, y)| x == y)
233                            .count() as isize;
234                        u -= len;
235                        v -= len;
236                        (u, v)
237                    };
238                    debug_assert!(xl <= u && u <= xr);
239                    debug_assert!(yl <= v && v <= yr);
240
241                    let ik = ktoi(k);
242                    self.vb[ik] = u;
243                    if !is_odd && kminf <= k && k <= kmaxf && self.vf[ik] >= u {
244                        return (2 * d as usize, (x as usize, y as usize));
245                    }
246                }
247            }
248        }
249
250        unreachable!();
251    }
252}
253
254/// Returns the correspondence between two sequences.
255///
256/// The return value is a pair of tuples. The first tuple contains the index
257/// where the item from the first sequence appears in the 2nd sequence or `None`
258/// if the item doesn't appear in the 2nd sequence. The 2nd tuple is the same
259/// but listing the corresponding indexes for the 2nd sequence in the first
260/// sequence.
261///
262/// # Examples
263///
264/// ```
265/// use seqdiff;
266/// let (a2b, b2a) = seqdiff::diff(&[1, 2, 3], &[1, 3]);
267/// assert_eq!(a2b, vec![Some(0), None, Some(1)]);
268/// assert_eq!(b2a, vec![Some(0), Some(2)]);
269/// ```
270pub fn diff<A: PartialEq<B>, B>(a: &[A], b: &[B]) -> (Diff, Diff) {
271    let mut st = Difference::new(a, b);
272    st.diff();
273    let Difference { xe, ye, .. } = st;
274    (xe, ye)
275}
276
277/// Compute similarity of two sequences.
278/// The similarity is a floating point number in [0., 100.], computed based on
279/// Levenshtein distance.
280/// This is useful, for example, fuzzy search.
281///
282/// # Examples
283///
284/// ```
285/// use seqdiff::ratio;
286/// let r = ratio(
287///     &"Hello world!".chars().collect::<Vec<_>>(),
288///     &"Holly grail!".chars().collect::<Vec<_>>(),
289/// );
290/// assert!((r - 58.333333333333337).abs() < 1e-5);
291/// ```
292#[allow(clippy::many_single_char_names)]
293pub fn ratio<A: PartialEq<B>, B>(a: &[A], b: &[B]) -> f64 {
294    let l = a.len() + b.len();
295    if l == 0 {
296        return 100.;
297    }
298    let dist = Difference::new(a, b).diff();
299    let ret = l - dist;
300    (ret * 100) as f64 / l as f64
301}