bio/pattern_matching/myers/
traceback.rs

1use std::default::Default;
2use std::iter;
3use std::marker::PhantomData;
4use std::ops::Range;
5
6use crate::alignment::AlignmentOperation;
7
8use crate::pattern_matching::myers::{word_size, BitVec, DistType, State};
9
10/// Objects implementing this trait handle the addition of calculated blocks (State<T, D>)
11/// to a container, and are responsible for creating the respective `TracebackHandler` object.
12pub(super) trait StatesHandler<'a, T, D>
13where
14    T: BitVec + 'a,
15    D: DistType,
16{
17    /// Object that helps obtaining a single traceback path
18    type TracebackHandler: TracebackHandler<'a, T, D>;
19    /// Type that represents a column in the DP matrix
20    type TracebackColumn: ?Sized;
21
22    /// Prepare for a new search given n (maximum expected number of traceback columns) and
23    /// m (pattern length).
24    /// Returns the expected size of the vector storing the calculated blocks given this
25    /// information. The vector will then be initialized with the given number of
26    /// State<T, D> objects and supplied to the other methods as slice.
27    fn init(&mut self, n: usize, m: D) -> usize;
28
29    /// The max. number of blocks needed for a DP matrix column
30    fn n_blocks(&self) -> usize;
31
32    /// Fill the column at `pos` with states initialized with the maximum distance
33    /// (`State::init_max_dist()`). The leftmost column of the traceback matrix needs this.
34    fn set_max_state(&self, pos: usize, states: &mut [State<T, D>]);
35
36    /// This method copies over all blocks (or the one block) from a tracback column
37    /// into the mutable `states` slice at the given column position.
38    fn add_state(&self, source: &Self::TracebackColumn, pos: usize, states: &mut [State<T, D>]);
39
40    /// Returns the edit distance at the given position, or `None` if `pos` is
41    /// out of bounds.
42    fn dist_at(&self, pos: usize, states: &[State<T, D>]) -> Option<D>;
43
44    /// Initiates a `TracebackHandler` object to assist with a traceback, 'starting'
45    /// at the given end position.
46    /// Should return `None` if the alignment cannot be obtained
47    /// (can happen with block-based algorithm if not all blocks were computed)
48    fn init_traceback(
49        &self,
50        m: D,
51        pos: usize,
52        states: &'a [State<T, D>],
53    ) -> Option<Self::TracebackHandler>;
54}
55
56/// `TracebackHandler` objects assist with the computation of a traceback path.
57///  The trait is implemented both for the simple and the block-based algorithms.
58///
59/// Implementors must keep track of two cells in the DP matrix:
60/// C = "current" cell
61/// L = "left" cell in the diagonal position
62///
63///   |————————————
64///   |   |   |   |
65///   |————————————
66///   |   | L |   |
67///   |————————————
68///   |   |   | C |
69///   |————————————
70///
71/// The idea behind this is, that diagonal moves in the DP matrix (= matches /
72/// substitutions) should be made as easy/fast as possible, while InDels
73/// are usually be less common (especially since there are no affine gap penalties)
74/// and upward/leftward moves can thus be more computationally intensive.
75///
76/// Implementors may store two `State<T, D>` blocks, whose `dist` field reflects
77/// the distance score of the current TB matrix cell, and along with that some auxiliary
78/// bit vectors indicating cursor position and assisting with adjustments to `dist` based
79/// deltas from the PV / MV bit vectors.
80pub(super) trait TracebackHandler<'a, T, D>
81where
82    T: BitVec + 'a,
83    D: DistType,
84{
85    /// Returns the distance score of the current cell C(i, j).
86    fn dist(&self) -> D;
87
88    /// Returns the distance score of the left cell, which is in the diagonal
89    /// position, C(i-1, j-1).
90    fn left_dist(&self) -> D;
91
92    /// Checks if the cell to the left C(i, j-1) has a smaller distance score than
93    /// the current cell C(i, j). If so, it calls `move_up()`.
94    fn try_move_up(&mut self) -> bool;
95
96    /// Moves up by one position in the DP matrix, adjusting the
97    /// distance scores/blocks/bit masks for the current and left columns.
98    fn move_up(&mut self);
99
100    /// Checks if the cell to the left C(i-1, j) has a smaller distance score than
101    /// the cell in the diagonal C(i-1, j-1). If so, adjusts the score/block
102    /// for the left column and returns `true`.
103    /// The "current" block needs no adjustment, as it will be discarded in
104    /// `finish_move_left()`, which needs to be called afterwards to complete
105    /// the move.
106    fn try_prepare_left(&mut self) -> bool;
107
108    /// Prepares for a diagonal move (which must be completed with `finish_move_left`).
109    /// Essentially, only bit masks and block indices need to be adjusted to
110    /// reflect the shift upwards by one position.
111    /// The blocks/distances themselves need no modification.
112    fn prepare_diagonal(&mut self);
113
114    /// Complete a 'move' to the left by
115    /// (1) replacing the current column with the left one (same block and score)
116    /// (2) adding a new column to the left and adjusting the distance score
117    /// to reflect the diagonal position.
118    fn finish_move_left(&mut self);
119
120    /// Returns true if topmost DP matrix row has been reached,
121    /// meaning that the complete alignment path has been found.
122    fn done(&self) -> bool;
123
124    /// For debugging only
125    #[allow(dead_code)]
126    fn print_state(&self);
127}
128
129#[derive(Clone, Debug)]
130pub(super) struct Traceback<'a, T, D, H>
131where
132    T: BitVec + 'a,
133    D: DistType,
134    H: StatesHandler<'a, T, D>,
135{
136    m: D,
137    positions: iter::Cycle<Range<usize>>,
138    handler: H,
139    pos: usize,
140    _t: PhantomData<&'a T>,
141}
142
143impl<'a, T, D, H> Traceback<'a, T, D, H>
144where
145    T: BitVec,
146    D: DistType,
147    H: StatesHandler<'a, T, D>,
148{
149    /// Creates a new `Traceback` instance.
150    /// The `states` vector is potentially reused, so all `State` instances
151    /// inside it must be initialized to new values.
152    #[inline]
153    pub fn new(
154        states: &mut Vec<State<T, D>>,
155        initial_state: &H::TracebackColumn,
156        num_cols: usize,
157        m: D,
158        mut handler: H,
159    ) -> Self {
160        // Correct traceback needs two additional columns at the left of the matrix (see below).
161        // Therefore reserving additional space.
162        let num_cols = num_cols + 2;
163
164        let n_states = handler.init(num_cols, m);
165
166        let mut tb = Traceback {
167            m,
168            positions: (0..num_cols).cycle(),
169            handler,
170            pos: 0,
171            _t: PhantomData,
172        };
173
174        // extend or truncate states vector
175        states.resize_with(n_states, Default::default);
176
177        // first column is used to ensure a correct path if the text (target)
178        // is shorter than the pattern (query)
179        tb.pos = tb.positions.next().unwrap();
180        tb.handler.set_max_state(tb.pos, states);
181
182        // initial state
183        tb.add_state(initial_state, states);
184
185        tb
186    }
187
188    #[inline]
189    pub fn add_state(&mut self, column: &H::TracebackColumn, states: &mut [State<T, D>]) {
190        self.pos = self.positions.next().unwrap();
191        self.handler.add_state(column, self.pos, states);
192    }
193
194    /// Returns distance for the given end position, or `None` if not stored
195    #[inline]
196    pub fn dist_at(&self, pos: usize, states: &'a [State<T, D>]) -> Option<D> {
197        let pos = pos + 2; // in order to be comparable since self.pos starts at 2, not 0
198        if pos <= self.pos {
199            return self.handler.dist_at(pos, states).map(|d| d as D);
200        }
201        None
202    }
203
204    /// Returns the length of the current match, optionally adding the
205    /// alignment path to `ops`
206    #[inline]
207    pub fn traceback(
208        &self,
209        ops: Option<&mut Vec<AlignmentOperation>>,
210        states: &'a [State<T, D>],
211    ) -> Option<(D, D)> {
212        self._traceback_at(self.pos, ops, states)
213    }
214
215    /// Returns the length of a match with a given end position, optionally adding the
216    /// alignment path to `ops`
217    /// only to be called if the `states` vec contains all states of the text
218    #[inline]
219    pub fn traceback_at(
220        &self,
221        pos: usize,
222        ops: Option<&mut Vec<AlignmentOperation>>,
223        states: &'a [State<T, D>],
224    ) -> Option<(D, D)> {
225        let pos = pos + 2; // in order to be comparable since self.pos starts at 2, not 0
226        if pos <= self.pos {
227            return self._traceback_at(pos, ops, states);
228        }
229        None
230    }
231
232    /// Returns a tuple of alignment length and hit distance, optionally adding the alignment path
233    /// to `ops`
234    #[inline]
235    fn _traceback_at(
236        &self,
237        pos: usize,
238        mut ops: Option<&mut Vec<AlignmentOperation>>,
239        state_slice: &'a [State<T, D>],
240    ) -> Option<(D, D)> {
241        use self::AlignmentOperation::*;
242
243        // self.print_tb_matrix(pos, state_slice);
244
245        // Generic object that holds the necessary data and methods
246        let mut h = self.handler.init_traceback(self.m, pos, state_slice)?;
247
248        // horizontal column offset from starting point in traceback matrix (bottom right)
249        let mut h_offset = D::zero();
250
251        // distance of the match (will be returned)
252        let dist = h.dist();
253
254        // Loop for finding the traceback path
255        while !h.done() {
256            let op;
257            // This loop is used to allow skipping `finish_move_left()` using break
258            // Like this, we avoid having to inline finish_move_left() three times.
259            #[allow(clippy::never_loop)]
260            loop {
261                // h.print_state();
262
263                // If there are several possible solutions, substitutions are preferred over InDels
264                // (Subst > Ins > Del)
265                if h.left_dist().wrapping_add(&D::one()) == h.dist() {
266                    // Diagonal (substitution), move up
267                    // Since the left cursor is always in the upper diagonal position,
268                    // a simple comparison of distances is enough to determine substitutions.
269                    h.prepare_diagonal();
270                    op = Subst;
271                    // then, move to left (below)
272                } else if h.try_move_up() {
273                    // Up (insertion to target = deletion in query pattern)
274                    op = Ins;
275                    break;
276                } else if h.try_prepare_left() {
277                    // Left (deletion in target text = insertion to query pattern)
278                    op = Del;
279                    // then, move to left (below)
280                } else {
281                    // Diagonal (match), move up
282                    debug_assert!(h.left_dist() == h.dist());
283                    h.prepare_diagonal();
284                    op = Match;
285                    // then, move to left (below)
286                }
287
288                // // This is (probably) equivalent to the Edlib strategy (Ins > Del > Subst)
289                // // Tests succeed with this strategy as well, and performance is similar.
290                // if h.try_move_up() {
291                //     op = Ins;
292                //     break;
293                // } else if h.left_dist() == h.dist() && h.try_prepare_left() {
294                //     op = Del;
295                // } else {
296                //     h.prepare_diagonal();
297                //     op = if h.left_dist().wrapping_add(&D::one()) == h.dist() {
298                //         Subst
299                //     } else {
300                //         Match
301                //     };
302                // }
303
304                // Moving one column to the left, adjusting h_offset
305                h_offset += D::one();
306                h.finish_move_left();
307                break;
308            }
309
310            // dbg!(op);
311
312            if let Some(o) = ops.as_mut() {
313                o.push(op);
314            }
315        }
316
317        Some((h_offset, dist))
318    }
319
320    // Useful for debugging
321    #[allow(dead_code)]
322    fn print_tb_matrix(&self, pos: usize, state_slice: &'a [State<T, D>]) {
323        let n_blocks = self.handler.n_blocks();
324        let pos = n_blocks * (pos + 1);
325        let states_iter = state_slice[..pos]
326            .chunks(n_blocks)
327            .rev()
328            .chain(state_slice.chunks(n_blocks).rev().cycle());
329        let m = self.m.to_usize().unwrap();
330        let mut out = vec![];
331        for col in states_iter {
332            let mut col_out = Vec::with_capacity(m);
333            let mut empty = true;
334            for (i, block) in col.iter().enumerate().rev() {
335                if !(block.is_new() || block.is_max()) {
336                    empty = false;
337                }
338                let w = word_size::<T>();
339                let end = (i + 1) * w;
340                let _m = if end <= m { w } else { m % w };
341                let mut _block = *block;
342                let mut pos_mask = T::one() << (_m - 1);
343                col_out.push(_block.dist);
344                for _ in 0.._m {
345                    // println!("{}\n{:064b}", _block, pos_mask);
346                    _block.adjust_one_up(pos_mask);
347                    pos_mask >>= 1;
348                    col_out.push(_block.dist);
349                }
350            }
351            out.push(col_out);
352            if empty {
353                break;
354            }
355        }
356
357        for j in (0..m).rev() {
358            eprint!("{:>4}: ", m - j + 1);
359            for col in out.iter().rev() {
360                if let Some(d) = col.get(j) {
361                    if *d >= (D::max_value() >> 1) {
362                        // missing value
363                        eprint!("    ");
364                    } else {
365                        eprint!("{:>4?}", d);
366                    }
367                } else {
368                    eprint!("   -");
369                }
370            }
371            eprintln!();
372        }
373    }
374}