bio/pattern_matching/myers/traceback.rs
1use std::default::Default;
2use std::iter;
3use std::marker::PhantomData;
4use std::ops::Range;
5
6use crate::alignment::AlignmentOperation;
7
8use crate::pattern_matching::myers::{word_size, BitVec, DistType, State};
9
10/// Objects implementing this trait handle the addition of calculated blocks (State<T, D>)
11/// to a container, and are responsible for creating the respective `TracebackHandler` object.
12pub(super) trait StatesHandler<'a, T, D>
13where
14 T: BitVec + 'a,
15 D: DistType,
16{
17 /// Object that helps obtaining a single traceback path
18 type TracebackHandler: TracebackHandler<'a, T, D>;
19 /// Type that represents a column in the DP matrix
20 type TracebackColumn: ?Sized;
21
22 /// Prepare for a new search given n (maximum expected number of traceback columns) and
23 /// m (pattern length).
24 /// Returns the expected size of the vector storing the calculated blocks given this
25 /// information. The vector will then be initialized with the given number of
26 /// State<T, D> objects and supplied to the other methods as slice.
27 fn init(&mut self, n: usize, m: D) -> usize;
28
29 /// The max. number of blocks needed for a DP matrix column
30 fn n_blocks(&self) -> usize;
31
32 /// Fill the column at `pos` with states initialized with the maximum distance
33 /// (`State::init_max_dist()`). The leftmost column of the traceback matrix needs this.
34 fn set_max_state(&self, pos: usize, states: &mut [State<T, D>]);
35
36 /// This method copies over all blocks (or the one block) from a tracback column
37 /// into the mutable `states` slice at the given column position.
38 fn add_state(&self, source: &Self::TracebackColumn, pos: usize, states: &mut [State<T, D>]);
39
40 /// Returns the edit distance at the given position, or `None` if `pos` is
41 /// out of bounds.
42 fn dist_at(&self, pos: usize, states: &[State<T, D>]) -> Option<D>;
43
44 /// Initiates a `TracebackHandler` object to assist with a traceback, 'starting'
45 /// at the given end position.
46 /// Should return `None` if the alignment cannot be obtained
47 /// (can happen with block-based algorithm if not all blocks were computed)
48 fn init_traceback(
49 &self,
50 m: D,
51 pos: usize,
52 states: &'a [State<T, D>],
53 ) -> Option<Self::TracebackHandler>;
54}
55
56/// `TracebackHandler` objects assist with the computation of a traceback path.
57/// The trait is implemented both for the simple and the block-based algorithms.
58///
59/// Implementors must keep track of two cells in the DP matrix:
60/// C = "current" cell
61/// L = "left" cell in the diagonal position
62///
63/// |————————————
64/// | | | |
65/// |————————————
66/// | | L | |
67/// |————————————
68/// | | | C |
69/// |————————————
70///
71/// The idea behind this is, that diagonal moves in the DP matrix (= matches /
72/// substitutions) should be made as easy/fast as possible, while InDels
73/// are usually be less common (especially since there are no affine gap penalties)
74/// and upward/leftward moves can thus be more computationally intensive.
75///
76/// Implementors may store two `State<T, D>` blocks, whose `dist` field reflects
77/// the distance score of the current TB matrix cell, and along with that some auxiliary
78/// bit vectors indicating cursor position and assisting with adjustments to `dist` based
79/// deltas from the PV / MV bit vectors.
80pub(super) trait TracebackHandler<'a, T, D>
81where
82 T: BitVec + 'a,
83 D: DistType,
84{
85 /// Returns the distance score of the current cell C(i, j).
86 fn dist(&self) -> D;
87
88 /// Returns the distance score of the left cell, which is in the diagonal
89 /// position, C(i-1, j-1).
90 fn left_dist(&self) -> D;
91
92 /// Checks if the cell to the left C(i, j-1) has a smaller distance score than
93 /// the current cell C(i, j). If so, it calls `move_up()`.
94 fn try_move_up(&mut self) -> bool;
95
96 /// Moves up by one position in the DP matrix, adjusting the
97 /// distance scores/blocks/bit masks for the current and left columns.
98 fn move_up(&mut self);
99
100 /// Checks if the cell to the left C(i-1, j) has a smaller distance score than
101 /// the cell in the diagonal C(i-1, j-1). If so, adjusts the score/block
102 /// for the left column and returns `true`.
103 /// The "current" block needs no adjustment, as it will be discarded in
104 /// `finish_move_left()`, which needs to be called afterwards to complete
105 /// the move.
106 fn try_prepare_left(&mut self) -> bool;
107
108 /// Prepares for a diagonal move (which must be completed with `finish_move_left`).
109 /// Essentially, only bit masks and block indices need to be adjusted to
110 /// reflect the shift upwards by one position.
111 /// The blocks/distances themselves need no modification.
112 fn prepare_diagonal(&mut self);
113
114 /// Complete a 'move' to the left by
115 /// (1) replacing the current column with the left one (same block and score)
116 /// (2) adding a new column to the left and adjusting the distance score
117 /// to reflect the diagonal position.
118 fn finish_move_left(&mut self);
119
120 /// Returns true if topmost DP matrix row has been reached,
121 /// meaning that the complete alignment path has been found.
122 fn done(&self) -> bool;
123
124 /// For debugging only
125 #[allow(dead_code)]
126 fn print_state(&self);
127}
128
129#[derive(Clone, Debug)]
130pub(super) struct Traceback<'a, T, D, H>
131where
132 T: BitVec + 'a,
133 D: DistType,
134 H: StatesHandler<'a, T, D>,
135{
136 m: D,
137 positions: iter::Cycle<Range<usize>>,
138 handler: H,
139 pos: usize,
140 _t: PhantomData<&'a T>,
141}
142
143impl<'a, T, D, H> Traceback<'a, T, D, H>
144where
145 T: BitVec,
146 D: DistType,
147 H: StatesHandler<'a, T, D>,
148{
149 /// Creates a new `Traceback` instance.
150 /// The `states` vector is potentially reused, so all `State` instances
151 /// inside it must be initialized to new values.
152 #[inline]
153 pub fn new(
154 states: &mut Vec<State<T, D>>,
155 initial_state: &H::TracebackColumn,
156 num_cols: usize,
157 m: D,
158 mut handler: H,
159 ) -> Self {
160 // Correct traceback needs two additional columns at the left of the matrix (see below).
161 // Therefore reserving additional space.
162 let num_cols = num_cols + 2;
163
164 let n_states = handler.init(num_cols, m);
165
166 let mut tb = Traceback {
167 m,
168 positions: (0..num_cols).cycle(),
169 handler,
170 pos: 0,
171 _t: PhantomData,
172 };
173
174 // extend or truncate states vector
175 states.resize_with(n_states, Default::default);
176
177 // first column is used to ensure a correct path if the text (target)
178 // is shorter than the pattern (query)
179 tb.pos = tb.positions.next().unwrap();
180 tb.handler.set_max_state(tb.pos, states);
181
182 // initial state
183 tb.add_state(initial_state, states);
184
185 tb
186 }
187
188 #[inline]
189 pub fn add_state(&mut self, column: &H::TracebackColumn, states: &mut [State<T, D>]) {
190 self.pos = self.positions.next().unwrap();
191 self.handler.add_state(column, self.pos, states);
192 }
193
194 /// Returns distance for the given end position, or `None` if not stored
195 #[inline]
196 pub fn dist_at(&self, pos: usize, states: &'a [State<T, D>]) -> Option<D> {
197 let pos = pos + 2; // in order to be comparable since self.pos starts at 2, not 0
198 if pos <= self.pos {
199 return self.handler.dist_at(pos, states).map(|d| d as D);
200 }
201 None
202 }
203
204 /// Returns the length of the current match, optionally adding the
205 /// alignment path to `ops`
206 #[inline]
207 pub fn traceback(
208 &self,
209 ops: Option<&mut Vec<AlignmentOperation>>,
210 states: &'a [State<T, D>],
211 ) -> Option<(D, D)> {
212 self._traceback_at(self.pos, ops, states)
213 }
214
215 /// Returns the length of a match with a given end position, optionally adding the
216 /// alignment path to `ops`
217 /// only to be called if the `states` vec contains all states of the text
218 #[inline]
219 pub fn traceback_at(
220 &self,
221 pos: usize,
222 ops: Option<&mut Vec<AlignmentOperation>>,
223 states: &'a [State<T, D>],
224 ) -> Option<(D, D)> {
225 let pos = pos + 2; // in order to be comparable since self.pos starts at 2, not 0
226 if pos <= self.pos {
227 return self._traceback_at(pos, ops, states);
228 }
229 None
230 }
231
232 /// Returns a tuple of alignment length and hit distance, optionally adding the alignment path
233 /// to `ops`
234 #[inline]
235 fn _traceback_at(
236 &self,
237 pos: usize,
238 mut ops: Option<&mut Vec<AlignmentOperation>>,
239 state_slice: &'a [State<T, D>],
240 ) -> Option<(D, D)> {
241 use self::AlignmentOperation::*;
242
243 // self.print_tb_matrix(pos, state_slice);
244
245 // Generic object that holds the necessary data and methods
246 let mut h = self.handler.init_traceback(self.m, pos, state_slice)?;
247
248 // horizontal column offset from starting point in traceback matrix (bottom right)
249 let mut h_offset = D::zero();
250
251 // distance of the match (will be returned)
252 let dist = h.dist();
253
254 // Loop for finding the traceback path
255 while !h.done() {
256 let op;
257 // This loop is used to allow skipping `finish_move_left()` using break
258 // Like this, we avoid having to inline finish_move_left() three times.
259 #[allow(clippy::never_loop)]
260 loop {
261 // h.print_state();
262
263 // If there are several possible solutions, substitutions are preferred over InDels
264 // (Subst > Ins > Del)
265 if h.left_dist().wrapping_add(&D::one()) == h.dist() {
266 // Diagonal (substitution), move up
267 // Since the left cursor is always in the upper diagonal position,
268 // a simple comparison of distances is enough to determine substitutions.
269 h.prepare_diagonal();
270 op = Subst;
271 // then, move to left (below)
272 } else if h.try_move_up() {
273 // Up (insertion to target = deletion in query pattern)
274 op = Ins;
275 break;
276 } else if h.try_prepare_left() {
277 // Left (deletion in target text = insertion to query pattern)
278 op = Del;
279 // then, move to left (below)
280 } else {
281 // Diagonal (match), move up
282 debug_assert!(h.left_dist() == h.dist());
283 h.prepare_diagonal();
284 op = Match;
285 // then, move to left (below)
286 }
287
288 // // This is (probably) equivalent to the Edlib strategy (Ins > Del > Subst)
289 // // Tests succeed with this strategy as well, and performance is similar.
290 // if h.try_move_up() {
291 // op = Ins;
292 // break;
293 // } else if h.left_dist() == h.dist() && h.try_prepare_left() {
294 // op = Del;
295 // } else {
296 // h.prepare_diagonal();
297 // op = if h.left_dist().wrapping_add(&D::one()) == h.dist() {
298 // Subst
299 // } else {
300 // Match
301 // };
302 // }
303
304 // Moving one column to the left, adjusting h_offset
305 h_offset += D::one();
306 h.finish_move_left();
307 break;
308 }
309
310 // dbg!(op);
311
312 if let Some(o) = ops.as_mut() {
313 o.push(op);
314 }
315 }
316
317 Some((h_offset, dist))
318 }
319
320 // Useful for debugging
321 #[allow(dead_code)]
322 fn print_tb_matrix(&self, pos: usize, state_slice: &'a [State<T, D>]) {
323 let n_blocks = self.handler.n_blocks();
324 let pos = n_blocks * (pos + 1);
325 let states_iter = state_slice[..pos]
326 .chunks(n_blocks)
327 .rev()
328 .chain(state_slice.chunks(n_blocks).rev().cycle());
329 let m = self.m.to_usize().unwrap();
330 let mut out = vec![];
331 for col in states_iter {
332 let mut col_out = Vec::with_capacity(m);
333 let mut empty = true;
334 for (i, block) in col.iter().enumerate().rev() {
335 if !(block.is_new() || block.is_max()) {
336 empty = false;
337 }
338 let w = word_size::<T>();
339 let end = (i + 1) * w;
340 let _m = if end <= m { w } else { m % w };
341 let mut _block = *block;
342 let mut pos_mask = T::one() << (_m - 1);
343 col_out.push(_block.dist);
344 for _ in 0.._m {
345 // println!("{}\n{:064b}", _block, pos_mask);
346 _block.adjust_one_up(pos_mask);
347 pos_mask >>= 1;
348 col_out.push(_block.dist);
349 }
350 }
351 out.push(col_out);
352 if empty {
353 break;
354 }
355 }
356
357 for j in (0..m).rev() {
358 eprint!("{:>4}: ", m - j + 1);
359 for col in out.iter().rev() {
360 if let Some(d) = col.get(j) {
361 if *d >= (D::max_value() >> 1) {
362 // missing value
363 eprint!(" ");
364 } else {
365 eprint!("{:>4?}", d);
366 }
367 } else {
368 eprint!(" -");
369 }
370 }
371 eprintln!();
372 }
373 }
374}