block_aligner/
scan_block.rs

//! Main Block Aligner algorithm and supporting data structures.

#[cfg(feature = "simd_sse2")]
use crate::sse2::*;

#[cfg(feature = "simd_avx2")]
use crate::avx2::*;

#[cfg(feature = "simd_wasm")]
use crate::simd128::*;

#[cfg(feature = "simd_neon")]
use crate::neon::*;

use crate::scores::*;
use crate::cigar::*;

use std::{cmp, ptr, i16, alloc};
use std::ops::RangeInclusive;

#[cfg(feature = "mca")]
use std::arch::asm;

// Notes:
//
// R means row, C means column (typically stands for the DP tables)
//
// BLOSUM62 matrix max = 11, min = -4; gap open = -11 (includes extension), gap extend = -1
//
// Dynamic programming formula:
// R[i][j] = max(R[i - 1][j] + gap_extend, D[i - 1][j] + gap_open)
// C[i][j] = max(C[i][j - 1] + gap_extend, D[i][j - 1] + gap_open)
// D[i][j] = max(D[i - 1][j - 1] + matrix[query[i]][reference[j]], R[i][j], C[i][j])
//
// indexing (we want to calculate D11):
//      x0   x1
//    +--------
// 0x | 00   01
// 1x | 10   11
//
// note that 'x' represents any bit
//
// The term "block" gets used for two different things (unfortunately):
//
// 1. A square region of the DP matrix that shifts, grows, and shrinks.
// This is helpful for conceptually visualizing the algorithm.
//
// 2. A rectangular region representing only cells in the DP matrix that are calculated
// due to shifting or growing. Since the step size is smaller than the block size, the
// square blocks overlap. Only the non-overlapping new cells (a rectangular region) are
// computed in each step. This usage applies for the "block" in the "place_block" function
// (this is also sometimes known as the "compute rect" function).
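//
// For reference, the recurrence above corresponds to the following scalar sketch
// (illustrative only; it is not part of the crate, is not used by the SIMD code below,
// and omits the block/offset machinery and the traceback). Note that gap_open already
// includes the first extension, so a gap of length k costs gap_open + (k - 1) * gap_extend:
//
// fn scalar_dp(query: &[u8], reference: &[u8], score: impl Fn(u8, u8) -> i32,
//              gap_open: i32, gap_extend: i32) -> i32 {
//     let (n, m) = (query.len(), reference.len());
//     let neg = i32::MIN / 2; // "negative infinity" that cannot overflow when added to
//     let mut d = vec![vec![neg; m + 1]; n + 1]; // D: overall best score ending at (i, j)
//     let mut r = vec![vec![neg; m + 1]; n + 1]; // R: best score ending in a gap in the reference
//     let mut c = vec![vec![neg; m + 1]; n + 1]; // C: best score ending in a gap in the query
//     d[0][0] = 0;
//     for i in 1..=n { r[i][0] = gap_open + (i as i32 - 1) * gap_extend; d[i][0] = r[i][0]; }
//     for j in 1..=m { c[0][j] = gap_open + (j as i32 - 1) * gap_extend; d[0][j] = c[0][j]; }
//     for i in 1..=n {
//         for j in 1..=m {
//             r[i][j] = cmp::max(r[i - 1][j] + gap_extend, d[i - 1][j] + gap_open);
//             c[i][j] = cmp::max(c[i][j - 1] + gap_extend, d[i][j - 1] + gap_open);
//             d[i][j] = cmp::max(d[i - 1][j - 1] + score(query[i - 1], reference[j - 1]),
//                                cmp::max(r[i][j], c[i][j]));
//         }
//     }
//     d[n][m] // global alignment score
// }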

/// Keeps track of internal state and some parameters for block aligner.
///
/// This does not describe the whole state. The allocated scratch spaces
/// and other local variables are also needed.
struct State<'a, M: Matrix> {
    query: &'a PaddedBytes,
    i: usize,
    reference: &'a PaddedBytes,
    j: usize,
    min_size: usize,
    max_size: usize,
    matrix: &'a M,
    gaps: Gaps,
    x_drop: i32
}

/// Keeps track of internal state and some parameters for block aligner for
/// sequence to profile alignment.
///
/// This does not describe the whole state. The allocated scratch spaces
/// and other local variables are also needed.
struct StateProfile<'a, P: Profile> {
    query: &'a PaddedBytes,
    i: usize,
    reference: &'a P,
    j: usize,
    min_size: usize,
    max_size: usize,
    x_drop: i32
}

/// Data structure storing the settings for Block Aligner.
///
/// A diagram showing different ways Block Aligner can be used:
/// ![Block Aligner modes](https://raw.githubusercontent.com/Daniel-Liu-c0deb0t/block-aligner/main/block_aligner_modes.png)
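///
/// The const generic parameters select the alignment mode. A couple of common
/// combinations (a sketch; see `align` for the full description of each flag):
///
/// ```ignore
/// use block_aligner::scan_block::*;
///
/// // Global alignment with traceback recorded:
/// type GlobalWithTrace = Block<true, false>;
/// // X-drop (extension) alignment without traceback:
/// type XDropNoTrace = Block<false, true>;
/// ```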
pub struct Block<const TRACE: bool, const X_DROP: bool = false, const LOCAL_START: bool = false, const FREE_QUERY_START_GAPS: bool = false, const FREE_QUERY_END_GAPS: bool = false> {
    res: AlignResult,
    allocated: Allocated
}

macro_rules! align_core_gen {
    ($fn_name:ident, $matrix_or_profile:tt, $state:tt, $place_block_right_fn:path, $place_block_down_fn:path) => {
        #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
        #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
        #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
        #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
        #[allow(non_snake_case)]
        unsafe fn $fn_name<M: $matrix_or_profile>(&mut self, mut state: $state<M>) {
            // store the best alignment ending location for x drop alignment
            let mut best_max = 0i32;
            let mut best_argmax_i = 0usize;
            let mut best_argmax_j = 0usize;

            let mut prev_dir = Direction::Grow;
            let mut dir = Direction::Grow;
            let mut prev_size = 0;
            let mut block_size = state.min_size;

            // 32-bit score offsets
            let mut off = 0i32;
            let mut prev_off;
            let mut off_max = 0i32;

            // how many steps since the latest best score was encountered
            let mut y_drop_iter = 0;

            // how many consecutive steps where the X-drop termination condition has been met
            let mut x_drop_iter = 0;

            let mut i_ckpt = state.i;
            let mut j_ckpt = state.j;
            let mut off_ckpt = 0i32;

            // corner value that affects the score when shifting down then right, or right then down
            let mut D_corner = simd_set1_i16(MIN);

            loop {
                #[cfg(feature = "debug")]
                {
                    println!("i: {}", state.i);
                    println!("j: {}", state.j);
                    println!("{:?}", dir);
                    println!("block size: {}", block_size);
                }

                prev_off = off;

                // grow_D_max is an auxiliary value needed when growing, because growing
                // requires two separate place_block steps
                let mut grow_D_max = simd_set1_i16(MIN);
                let mut grow_D_argmax_i = simd_set1_i16(0);
                let mut grow_D_argmax_j = simd_set1_i16(0);
                let (D_max, D_argmax_i, D_argmax_j, mut right_max, mut down_max) = match dir {
                    Direction::Right => {
                        off = off_max;
                        #[cfg(feature = "debug")]
                        println!("off: {}", off);
                        let off_add = simd_set1_i16(clamp(prev_off - off));

                        if TRACE {
                            self.allocated.trace.add_block(state.i, state.j + block_size - STEP, STEP, block_size, true);
                        }

                        // offset previous columns with newly computed offset
                        Self::just_offset(block_size, self.allocated.D_col.as_mut_ptr(), self.allocated.C_col.as_mut_ptr(), off_add);

                        // compute new elements in the block as a result of shifting by the step size
                        // this region should be block_size x step
                        let (D_max, D_argmax_i, D_argmax_j) = $place_block_right_fn(
                            &state,
                            state.query,
                            state.reference,
                            &mut self.allocated.trace,
                            state.i,
                            state.j + block_size - STEP,
                            STEP,
                            block_size,
                            self.allocated.D_col.as_mut_ptr(),
                            self.allocated.C_col.as_mut_ptr(),
                            self.allocated.temp_buf1.as_mut_ptr(),
                            self.allocated.temp_buf2.as_mut_ptr(),
                            if prev_dir == Direction::Down { simd_adds_i16(D_corner, off_add) } else { simd_set1_i16(MIN) },
                            clamp(-off + (ZERO as i32)),
                            true
                        );

                        // max of a few elements on the right border
                        let right_max = Self::prefix_max(self.allocated.D_col.as_ptr());

                        // shift and offset bottom row
                        D_corner = Self::shift_and_offset(
                            block_size,
                            self.allocated.D_row.as_mut_ptr(),
                            self.allocated.R_row.as_mut_ptr(),
                            self.allocated.temp_buf1.as_mut_ptr(),
                            self.allocated.temp_buf2.as_mut_ptr(),
                            off_add
                        );
                        // max of a few elements on the bottom border
                        let down_max = Self::prefix_max(self.allocated.D_row.as_ptr());

                        (D_max, D_argmax_i, D_argmax_j, right_max, down_max)
                    },
                    Direction::Down => {
                        off = off_max;
                        #[cfg(feature = "debug")]
                        println!("off: {}", off);
                        let off_add = simd_set1_i16(clamp(prev_off - off));

                        if TRACE {
                            self.allocated.trace.add_block(state.i + block_size - STEP, state.j, block_size, STEP, false);
                        }

                        // offset previous rows with newly computed offset
                        Self::just_offset(block_size, self.allocated.D_row.as_mut_ptr(), self.allocated.R_row.as_mut_ptr(), off_add);

                        // compute new elements in the block as a result of shifting by the step size
                        // this region should be step x block_size
                        let (D_max, D_argmax_i, D_argmax_j) = $place_block_down_fn(
                            &state,
                            state.reference,
                            state.query,
                            &mut self.allocated.trace,
                            state.j,
                            state.i + block_size - STEP,
                            STEP,
                            block_size,
                            self.allocated.D_row.as_mut_ptr(),
                            self.allocated.R_row.as_mut_ptr(),
                            self.allocated.temp_buf1.as_mut_ptr(),
                            self.allocated.temp_buf2.as_mut_ptr(),
                            if prev_dir == Direction::Right { simd_adds_i16(D_corner, off_add) } else { simd_set1_i16(MIN) },
                            clamp(-off + (ZERO as i32)),
                            false
                        );

                        // max of a few elements on the bottom border
                        let down_max = Self::prefix_max(self.allocated.D_row.as_ptr());

                        // shift and offset last column
                        D_corner = Self::shift_and_offset(
                            block_size,
                            self.allocated.D_col.as_mut_ptr(),
                            self.allocated.C_col.as_mut_ptr(),
                            self.allocated.temp_buf1.as_mut_ptr(),
                            self.allocated.temp_buf2.as_mut_ptr(),
                            off_add
                        );
                        // max of a few elements on the right border
                        let right_max = Self::prefix_max(self.allocated.D_col.as_ptr());

                        (D_max, D_argmax_i, D_argmax_j, right_max, down_max)
                    },
                    Direction::Grow => {
                        D_corner = simd_set1_i16(MIN);
                        let grow_step = block_size - prev_size;

                        #[cfg(feature = "debug")]
                        println!("off: {}", off);
                        #[cfg(feature = "debug")]
                        println!("Grow down");

                        if TRACE {
                            self.allocated.trace.add_block(state.i + prev_size, state.j, prev_size, grow_step, false);
                        }

                        // down
                        // this region should be prev_size x prev_size
                        let (D_max1, D_argmax_i1, D_argmax_j1) = $place_block_down_fn(
                            &state,
                            state.reference,
                            state.query,
                            &mut self.allocated.trace,
                            state.j,
                            state.i + prev_size,
                            grow_step,
                            prev_size,
                            self.allocated.D_row.as_mut_ptr(),
                            self.allocated.R_row.as_mut_ptr(),
                            self.allocated.D_col.as_mut_ptr().add(prev_size),
                            self.allocated.C_col.as_mut_ptr().add(prev_size),
                            simd_set1_i16(MIN),
                            clamp(-off + (ZERO as i32)),
                            false
                        );

                        #[cfg(feature = "debug")]
                        println!("Grow right");

                        if TRACE {
                            self.allocated.trace.add_block(state.i, state.j + prev_size, grow_step, block_size, true);
                        }

                        // right
                        // this region should be block_size x prev_size
                        let (D_max2, D_argmax_i2, D_argmax_j2) = $place_block_right_fn(
                            &state,
                            state.query,
                            state.reference,
                            &mut self.allocated.trace,
                            state.i,
                            state.j + prev_size,
                            grow_step,
                            block_size,
                            self.allocated.D_col.as_mut_ptr(),
                            self.allocated.C_col.as_mut_ptr(),
                            self.allocated.D_row.as_mut_ptr().add(prev_size),
                            self.allocated.R_row.as_mut_ptr().add(prev_size),
                            simd_set1_i16(MIN),
                            clamp(-off + (ZERO as i32)),
                            true
                        );

                        let right_max = Self::prefix_max(self.allocated.D_col.as_ptr());
                        let down_max = Self::prefix_max(self.allocated.D_row.as_ptr());
                        grow_D_max = D_max1;
                        grow_D_argmax_i = D_argmax_i1;
                        grow_D_argmax_j = D_argmax_j1;

                        // must update the checkpoint saved values just in case
                        // the block must grow again from this position
                        let mut i = 0;
                        while i < block_size {
                            self.allocated.D_col_ckpt.set_vec(&self.allocated.D_col, i);
                            self.allocated.C_col_ckpt.set_vec(&self.allocated.C_col, i);
                            self.allocated.D_row_ckpt.set_vec(&self.allocated.D_row, i);
                            self.allocated.R_row_ckpt.set_vec(&self.allocated.R_row, i);
                            i += L;
                        }

                        if TRACE {
                            self.allocated.trace.save_ckpt();
                        }

                        (D_max2, D_argmax_i2, D_argmax_j2, right_max, down_max)
                    }
                };

                prev_dir = dir;
                let D_max_max = if FREE_QUERY_END_GAPS {
                    // can assume only the right region is computed when growing,
                    // since the min block size is greater than the query length
                    simd_slow_extract_i16(D_max, state.query.len() % L)
                } else {
                    simd_hmax_i16(D_max)
                };
                let grow_max = simd_hmax_i16(grow_D_max);
                // max score of the entire block
                // note that other than off_max and best_max, the other maxs are relative to the
                // offsets off and ZERO
                let max = cmp::max(D_max_max, grow_max);
                off_max = off + (max as i32) - (ZERO as i32);
                #[cfg(feature = "debug")]
                println!("down max: {}, right max: {}", down_max, right_max);

                y_drop_iter += 1;
                // if block grows but the best score does not improve, then the block must grow again
                let mut grow_no_max = dir == Direction::Grow;

                if off_max > best_max {
                    if FREE_QUERY_END_GAPS {
                        // can assume either growing (right region only) or shifting right, so
                        // can assume state.i == 0
                        let idx_j = simd_slow_extract_i16(D_argmax_j, state.query.len() % L) as usize;
                        best_argmax_i = state.query.len();
                        match dir {
                            Direction::Right => {
                                best_argmax_j = state.j + (block_size - STEP) + idx_j;
                            },
                            Direction::Grow => {
                                best_argmax_j = state.j + prev_size + idx_j;
                            },
                            _ => unreachable!(),
                        }
                    }

                    if X_DROP {
                        // TODO: move outside loop
                        // calculate location with the best score
                        let lane_idx = simd_hargmax_i16(D_max, D_max_max);
                        let idx_i = simd_slow_extract_i16(D_argmax_i, lane_idx) as usize;
                        let idx_j = simd_slow_extract_i16(D_argmax_j, lane_idx) as usize;
                        let r = idx_i + lane_idx;
                        let c = (block_size - STEP) + idx_j;

                        match dir {
                            Direction::Right => {
                                best_argmax_i = state.i + r;
                                best_argmax_j = state.j + c;
                            },
                            Direction::Down => {
                                best_argmax_i = state.i + c;
                                best_argmax_j = state.j + r;
                            },
                            Direction::Grow => {
                                // max could be in either block
                                if D_max_max >= grow_max {
                                    // grow right
                                    best_argmax_i = state.i + idx_i + lane_idx;
                                    best_argmax_j = state.j + prev_size + idx_j;
                                } else {
                                    // grow down
                                    let lane_idx = simd_hargmax_i16(grow_D_max, grow_max);
                                    let idx_i = simd_slow_extract_i16(grow_D_argmax_i, lane_idx) as usize;
                                    let idx_j = simd_slow_extract_i16(grow_D_argmax_j, lane_idx) as usize;
                                    best_argmax_i = state.i + prev_size + idx_j;
                                    best_argmax_j = state.j + idx_i + lane_idx;
                                }
                            }
                        }
                    }

                    if block_size < state.max_size {
                        // if able to grow in the future, then save the current location
                        // as a checkpoint
                        i_ckpt = state.i;
                        j_ckpt = state.j;
                        off_ckpt = off;

                        let mut i = 0;
                        while i < block_size {
                            self.allocated.D_col_ckpt.set_vec(&self.allocated.D_col, i);
                            self.allocated.C_col_ckpt.set_vec(&self.allocated.C_col, i);
                            self.allocated.D_row_ckpt.set_vec(&self.allocated.D_row, i);
                            self.allocated.R_row_ckpt.set_vec(&self.allocated.R_row, i);
                            i += L;
                        }

                        if TRACE {
                            self.allocated.trace.save_ckpt();
                        }

                        grow_no_max = false;
                    }

                    best_max = off_max;

                    y_drop_iter = 0;
                }

                if X_DROP {
                    if off_max < best_max - state.x_drop {
                        if x_drop_iter < X_DROP_ITER - 1 {
                            x_drop_iter += 1;
                        } else {
                            // x drop termination
                            break;
                        }
                    } else {
                        x_drop_iter = 0;
                    }
                }

                if state.i + block_size > state.query.len() && state.j + block_size > state.reference.len() {
                    // reached the end of the strings
                    break;
                }

                // first check if the shift direction is "forced" to avoid going out of bounds
                if state.j + block_size > state.reference.len() {
                    state.i += STEP;
                    dir = Direction::Down;
                    continue;
                }
                if state.i + block_size > state.query.len() {
                    state.j += STEP;
                    dir = Direction::Right;
                    continue;
                }

                // three decisions are made below (based on heuristics):
                // * whether to grow
                // * whether to shrink
                // * whether to shift right or down
                // TODO: better heuristics?

                // check if it is possible to grow
                let next_size = block_size * 2;
                if next_size <= state.max_size {
                    // if approximately (block_size / step) iterations have passed since the last best
                    // max, then it is time to grow
                    if y_drop_iter > (block_size / STEP) - 1 || grow_no_max {
                        // y drop grow block
                        prev_size = block_size;
                        block_size = next_size;
                        dir = Direction::Grow;

                        // return to checkpoint
                        state.i = i_ckpt;
                        state.j = j_ckpt;
                        off = off_ckpt;

                        let mut i = 0;
                        while i < prev_size {
                            self.allocated.D_col.set_vec(&self.allocated.D_col_ckpt, i);
                            self.allocated.C_col.set_vec(&self.allocated.C_col_ckpt, i);
                            self.allocated.D_row.set_vec(&self.allocated.D_row_ckpt, i);
                            self.allocated.R_row.set_vec(&self.allocated.R_row_ckpt, i);
                            i += L;
                        }

                        if TRACE {
                            self.allocated.trace.restore_ckpt();
                        }

                        y_drop_iter = 0;
                        continue;
                    }
                }

                // check if it is possible to shrink
                if SHRINK && block_size > state.min_size && y_drop_iter == 0 {
                    let shrink_max = cmp::max(
                        Self::suffix_max(self.allocated.D_row.as_ptr(), block_size),
                        Self::suffix_max(self.allocated.D_col.as_ptr(), block_size)
                    );
                    if shrink_max >= max {
                        // set prev_dir to Grow so the next step is not treated as a right or down shift and D_corner is not used
                        prev_dir = Direction::Grow;

                        block_size /= 2;
                        let mut i = 0;
                        while i < block_size {
                            self.allocated.D_col.copy_vec(i, i + block_size);
                            self.allocated.C_col.copy_vec(i, i + block_size);
                            self.allocated.D_row.copy_vec(i, i + block_size);
                            self.allocated.R_row.copy_vec(i, i + block_size);
                            i += L;
                        }

                        state.i += block_size;
                        state.j += block_size;

                        i_ckpt = state.i;
                        j_ckpt = state.j;
                        off_ckpt = off;

                        let mut i = 0;
                        while i < block_size {
                            self.allocated.D_col_ckpt.set_vec(&self.allocated.D_col, i);
                            self.allocated.C_col_ckpt.set_vec(&self.allocated.C_col, i);
                            self.allocated.D_row_ckpt.set_vec(&self.allocated.D_row, i);
                            self.allocated.R_row_ckpt.set_vec(&self.allocated.R_row, i);
                            i += L;
                        }

                        right_max = Self::prefix_max(self.allocated.D_col.as_ptr());
                        down_max = Self::prefix_max(self.allocated.D_row.as_ptr());

                        if TRACE {
                            self.allocated.trace.save_ckpt();
                        }

                        y_drop_iter = 0;
                    }
                }

                // move according to where the max is
                if down_max > right_max {
                    state.i += STEP;
                    dir = Direction::Down;
                } else {
                    state.j += STEP;
                    dir = Direction::Right;
                }
            }

            #[cfg(any(feature = "debug", feature = "debug_size"))]
            {
                println!("query size: {}, reference size: {}", state.query.len(), state.reference.len());
                println!("end block size: {}", block_size);
            }

            self.res = if X_DROP || FREE_QUERY_END_GAPS {
                AlignResult {
                    score: best_max,
                    query_idx: best_argmax_i,
                    reference_idx: best_argmax_j
                }
            } else {
                debug_assert!(state.i <= state.query.len());
                let score = off + match dir {
                    Direction::Right | Direction::Grow => {
                        let idx = state.query.len() - state.i;
                        debug_assert!(idx < block_size);
                        (self.allocated.D_col.get(idx) as i32) - (ZERO as i32)
                    },
                    Direction::Down => {
                        let idx = state.reference.len() - state.j;
                        debug_assert!(idx < block_size);
                        (self.allocated.D_row.get(idx) as i32) - (ZERO as i32)
                    }
                };
                AlignResult {
                    score,
                    query_idx: state.query.len(),
                    reference_idx: state.reference.len()
                }
            };
        }
    };
}

/// Place block right or down for sequence-profile alignment.
///
/// Although conceptually blocks are squares, this function is actually used to compute any
/// rectangular region. For example, when shifting a block right by some step
/// size, only the rectangular region with width = step size needs to be computed, since
/// the new shifted block will partially overlap with the previous block.
///
/// Assumes all inputs are already relative to the current offset.
///
/// Inside this function, everything will be treated as shifting right,
/// conceptually. The same process can be trivially used for shifting
/// down by calling this function with different parameters.
///
/// Right and down shifts must be handled separately since a sequence
/// is aligned to a profile.
macro_rules! place_block_profile_gen {
    ($fn_name:ident, $query: ident, $query_type: ty, $reference: ident, $reference_type: ty, $q: ident, $r: ident, $right: expr) => {
        #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
        #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
        #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
        #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
        #[allow(non_snake_case)]
        unsafe fn $fn_name<P: Profile>(_state: &StateProfile<P>,
                                       $query: $query_type,
                                       $reference: $reference_type,
                                       trace: &mut Trace,
                                       start_i: usize,
                                       start_j: usize,
                                       width: usize,
                                       height: usize,
                                       D_col: *mut i16,
                                       C_col: *mut i16,
                                       D_row: *mut i16,
                                       R_row: *mut i16,
                                       mut D_corner: Simd,
                                       relative_zero: i16,
                                       _right: bool) -> (Simd, Simd, Simd) {
            let gap_extend = simd_set1_i16($r.get_gap_extend() as i16);
            let (gap_extend_all, prefix_scan_consts) = get_prefix_scan_consts(gap_extend);
            let mut D_max = simd_set1_i16(MIN);
            let mut D_argmax_i = simd_set1_i16(0);
            let mut D_argmax_j = simd_set1_i16(0);

            let mut idx = 0;
            let mut gap_open_C = simd_set1_i16(MIN);
            let mut gap_close_C = simd_set1_i16(MIN);
            let mut gap_open_R = simd_set1_i16(MIN);
            let mut gap_close_R = simd_set1_i16(MIN);

            if width == 0 || height == 0 {
                return (D_max, D_argmax_i, D_argmax_j);
            }

            // hottest loop in the whole program
            for j in 0..width {
                let mut R01 = simd_set1_i16(MIN);
                let mut D11 = simd_set1_i16(MIN);
                let mut R11 = simd_set1_i16(MIN);
                let mut prev_trace_R = simd_set1_i16(0);

                if $right {
                    idx = start_j + j;
                    gap_open_C = $r.get_gap_open_right_C(idx);
                    gap_close_C = $r.get_gap_close_right_C(idx);
                    gap_open_R = $r.get_gap_open_right_R(idx);
                }

                let mut i = 0;
                while i < height {
                    let D10 = simd_load(D_col.add(i) as _);
                    let C10 = simd_load(C_col.add(i) as _);
                    let D00 = simd_sl_i16!(D10, D_corner, 1);
                    D_corner = D10;

                    if !$right {
                        idx = start_i + i;
                        gap_open_C = $r.get_gap_open_down_R(idx);
                        gap_open_R = $r.get_gap_open_down_C(idx);
                        gap_close_R = $r.get_gap_close_down_C(idx);
                    }

                    let scores = if $right {
                        $r.get_scores_pos(idx, halfsimd_loadu($q.as_ptr(start_i + i) as _), true)
                    } else {
                        $r.get_scores_aa(idx, $q.get(start_j + j), false)
                    };
                    D11 = simd_adds_i16(D00, scores);
                    if (!LOCAL_START && start_i + i == 0 && start_j + j == 0) || (FREE_QUERY_START_GAPS && $right && start_i + i == 0) {
                        D11 = simd_insert_i16!(D11, relative_zero, 0);
                    }

                    if LOCAL_START {
                        D11 = simd_max_i16(D11, simd_set1_i16(relative_zero));
                    }

                    let C11_open = simd_adds_i16(D10, simd_adds_i16(gap_open_C, gap_extend));
                    let C11 = simd_max_i16(simd_adds_i16(C10, gap_extend), C11_open);
                    let C11_end = if $right { simd_adds_i16(C11, gap_close_C) } else { C11 };
                    D11 = simd_max_i16(D11, C11_end);
                    // at this point, C11 is fully calculated and D11 is partially calculated

                    let D11_open = simd_adds_i16(D11, gap_open_R);
                    R11 = simd_prefix_scan_i16(D11_open, gap_extend, prefix_scan_consts);
                    // do prefix scan before using R01 to break up dependency chain that depends on
                    // the last element of R01 from the previous loop iteration
                    R11 = simd_max_i16(R11, simd_adds_i16(simd_broadcasthi_i16(R01), gap_extend_all));
                    // fully calculate D11 using R11
                    let R11_end = if $right { R11 } else { simd_adds_i16(R11, gap_close_R) };
                    D11 = simd_max_i16(D11, R11_end);
                    R01 = R11;

                    #[cfg(feature = "debug")]
                    {
                        print!("s:   ");
                        simd_dbg_i16(scores);
                        print!("D00: ");
                        simd_dbg_i16(simd_subs_i16(D00, simd_set1_i16(ZERO)));
                        print!("C11: ");
                        simd_dbg_i16(simd_subs_i16(C11, simd_set1_i16(ZERO)));
                        print!("R11: ");
                        simd_dbg_i16(simd_subs_i16(R11, simd_set1_i16(ZERO)));
                        print!("D11: ");
                        simd_dbg_i16(simd_subs_i16(D11, simd_set1_i16(ZERO)));
                    }

                    if TRACE {
                        let trace_D_C = simd_cmpeq_i16(D11, C11_end);
                        let trace_D_R = simd_cmpeq_i16(D11, R11_end);
                        #[cfg(feature = "debug")]
                        {
                            print!("D_C: ");
                            simd_dbg_i16(trace_D_C);
                            print!("D_R: ");
                            simd_dbg_i16(trace_D_R);
                        }
                        // compress trace with movemask to save space
                        let mask = simd_set1_i16(0xFF00u16 as i16);
                        let trace_data = simd_movemask_i8(simd_blend_i8(trace_D_C, trace_D_R, mask));
                        let temp_trace_R = simd_cmpeq_i16(R11, D11_open);
                        let trace_R = simd_sl_i16!(temp_trace_R, prev_trace_R, 1);
                        let trace_data2 = simd_movemask_i8(simd_blend_i8(simd_cmpeq_i16(C11, C11_open), trace_R, mask));
                        prev_trace_R = temp_trace_R;

                        if LOCAL_START {
                            let zero_mask = simd_cmpeq_i16(D11, simd_set1_i16(relative_zero));
                            trace.add_zero_mask(simd_movemask_i8(zero_mask) as TraceType);
                        }

                        trace.add_trace(trace_data as TraceType, trace_data2 as TraceType);
                    }

                    D_max = simd_max_i16(D_max, D11);

                    if X_DROP || (FREE_QUERY_END_GAPS && start_i + i + L > $query.len()) {
                        // keep track of the best score and its location
                        // note: can assume right = true and only the last SIMD vectors are needed for FREE_QUERY_END_GAPS,
                        // due to the limitation that the min block size must be greater than query length
                        let mask = simd_cmpeq_i16(D_max, D11);
                        D_argmax_i = simd_blend_i8(D_argmax_i, simd_set1_i16(i as i16), mask);
                        D_argmax_j = simd_blend_i8(D_argmax_j, simd_set1_i16(j as i16), mask);
                    }

                    simd_store(D_col.add(i) as _, D11);
                    simd_store(C_col.add(i) as _, C11);
                    i += L;
                }

                D_corner = simd_set1_i16(MIN);

                ptr::write(D_row.add(j), simd_extract_i16!(D11, L - 1));
                ptr::write(R_row.add(j), simd_extract_i16!(R11, L - 1));

                if !X_DROP && !FREE_QUERY_END_GAPS && start_i + height > $query.len()
                    && start_j + j >= $reference.len() {
                    if TRACE {
                        // make sure that the trace index is updated since the rest of the loop
                        // iterations are skipped
                        trace.add_trace_idx((width - 1 - j) * (height / L));
                    }
                    break;
                }
            }

            (D_max, D_argmax_i, D_argmax_j)
        }
    };
}

// increasing the step size gives a bit of extra speed but results in lower accuracy
// current settings are fast, at the expense of some accuracy, and the step size does not grow
const STEP: usize = 8;
const X_DROP_ITER: usize = 2; // make sure the X-drop criterion is truly met, instead of terminating after just one "bad" step
const SHRINK: bool = true; // whether to allow the block size to shrink by powers of 2
const SHRINK_SUFFIX_LEN: usize = STEP / 4;
impl<const TRACE: bool, const X_DROP: bool, const LOCAL_START: bool, const FREE_QUERY_START_GAPS: bool, const FREE_QUERY_END_GAPS: bool> Block<{ TRACE }, { X_DROP }, { LOCAL_START }, { FREE_QUERY_START_GAPS }, { FREE_QUERY_END_GAPS }> {
    /// Allocate a block aligner instance with an upper bound query length,
    /// reference length, and max block size.
    ///
    /// A block aligner instance can be reused for multiple alignments as long
    /// as the aligned sequence lengths and block sizes do not exceed the specified
    /// upper bounds.
    pub fn new(query_len: usize, reference_len: usize, max_size: usize) -> Self {
        assert!(max_size.is_power_of_two(), "Block size must be a power of two!");

        Self {
            res: AlignResult { score: 0, query_idx: 0, reference_idx: 0 },
            allocated: Allocated::new(query_len, reference_len, max_size, TRACE, LOCAL_START, FREE_QUERY_START_GAPS)
        }
    }

    /// Align two sequences with block aligner.
    ///
    /// If `TRACE` is true, then information for computing the traceback will be stored.
    /// After alignment, the traceback CIGAR string can then be computed.
    /// This will slow down alignment and use a lot more memory.
    ///
    /// If `X_DROP` is true, then the alignment process will be terminated early when
    /// the max score in the current block drops by `x_drop` below the max score encountered
    /// so far. The location of the max score is stored in the alignment result.
    /// This allows the alignment to end anywhere in the DP matrix.
    /// If `X_DROP` is false, then global alignment is done.
    ///
    /// If `LOCAL_START` is true, then the alignment is allowed to start anywhere in the DP matrix.
    /// Local alignment can be accomplished by setting `LOCAL_START` and `X_DROP` to true and `x_drop`
    /// to a very large value.
    ///
    /// If `FREE_QUERY_START_GAPS` is true, then gaps before the start of the query are free.
    ///
    /// If `FREE_QUERY_END_GAPS` is true, then gaps after the end of the query are free.
    /// Note that this has a limitation: the min block size must be greater than the length of the query.
    ///
    /// Since larger scores are better, gap and mismatch penalties must be negative.
    ///
    /// The minimum and maximum sizes of the block must be powers of 2 that are greater than the
    /// number of 16-bit lanes in a SIMD vector.
    ///
    /// The block aligner algorithm will dynamically shift a block down or right and grow its size
    /// to efficiently calculate the alignment between two strings.
    /// This is fast, but it may be slightly less accurate than computing the entire alignment
    /// dynamic programming matrix. Growing the size of the block allows larger gaps and
    /// other potentially difficult regions to be handled correctly.
    /// The algorithm also allows shrinking the block size for greater efficiency when handling
    /// regions in the sequences with no gaps.
    /// 16-bit deltas and 32-bit offsets are used to ensure that accurate scores are
    /// computed, even when the strings are long.
    ///
    /// When aligning sequences `q` against `r`, this algorithm computes cells in the DP matrix
    /// with `|q| + 1` rows and `|r| + 1` columns.
    ///
    /// X-drop alignment with `ByteMatrix` is not supported.
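    ///
    /// # Examples
    ///
    /// A minimal usage sketch (adapted from the crate README; the sequences, scoring
    /// values, and block sizes here are purely illustrative):
    ///
    /// ```ignore
    /// use block_aligner::{cigar::*, scan_block::*, scores::*};
    ///
    /// let min_block_size = 32;
    /// let max_block_size = 256;
    ///
    /// // A gap of length n costs open + extend * (n - 1).
    /// let gaps = Gaps { open: -2, extend: -1 };
    ///
    /// let r = PaddedBytes::from_bytes::<NucMatrix>(b"TTAAAAAAATTTTTTTTTTTT", max_block_size);
    /// let q = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", max_block_size);
    ///
    /// // Global alignment with traceback and no X-drop threshold.
    /// let mut a = Block::<true, false>::new(q.len(), r.len(), max_block_size);
    /// a.align(&q, &r, &NW1, gaps, min_block_size..=max_block_size, 0);
    ///
    /// let res = a.res();
    /// println!("score: {}", res.score);
    /// ```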
    pub fn align<M: Matrix>(&mut self, query: &PaddedBytes, reference: &PaddedBytes, matrix: &M, gaps: Gaps, size: RangeInclusive<usize>, x_drop: i32) {
        // check invariants so bad stuff doesn't happen later
        assert!(gaps.open < 0 && gaps.extend < 0, "Gap costs must be negative!");
        // there are edge cases in calculating the traceback that don't work if
        // gap open does not cost more than gap extend
        assert!(gaps.open < gaps.extend, "Gap open must cost more than gap extend!");
        let min_size = if *size.start() < L { L } else { *size.start() };
        let max_size = if *size.end() < L { L } else { *size.end() };
        assert!(min_size < (u16::MAX as usize) && max_size < (u16::MAX as usize), "Block sizes must be smaller than 2^16 - 1!");
        assert!(min_size.is_power_of_two() && max_size.is_power_of_two(), "Block sizes must be powers of two!");
        if X_DROP {
            assert!(x_drop >= 0, "X-drop threshold amount must be nonnegative!");
        }
        assert!(!LOCAL_START || !FREE_QUERY_START_GAPS, "Cannot set both LOCAL_START and FREE_QUERY_START_GAPS!");
        assert!(!X_DROP || !FREE_QUERY_END_GAPS, "Cannot set both X_DROP and FREE_QUERY_END_GAPS!");
        assert!(!FREE_QUERY_END_GAPS || min_size > query.len(), "Min block size must be larger than the query length for FREE_QUERY_END_GAPS!");

        unsafe { self.allocated.clear(query.len(), reference.len(), max_size, TRACE); }

        let s = State {
            query,
            i: 0,
            reference,
            j: 0,
            min_size,
            max_size,
            matrix,
            gaps,
            x_drop
        };
        unsafe { self.align_core(s); }
    }

    /// Align two sequences with exponential search on the min block size.
    ///
    /// This calls `align` multiple times, doubling the min block size in each iteration
    /// until either the max block size is reached or the score reaches or exceeds the target score.
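    ///
    /// Sketch of the intended usage (hypothetical values; `a`, `q`, `r`, and `gaps` are set up
    /// as in the `align` example):
    ///
    /// ```ignore
    /// // Accept the smallest min block size whose score reaches 100, searching 32..=256.
    /// match a.align_exp(&q, &r, &NW1, gaps, 32..=256, 0, 100) {
    ///     Some(min_size) => println!("reached target with min block size {}", min_size),
    ///     None => println!("target score not reached even at the max block size"),
    /// }
    /// ```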
    pub fn align_exp<M: Matrix>(&mut self, query: &PaddedBytes, reference: &PaddedBytes, matrix: &M, gaps: Gaps, size: RangeInclusive<usize>, x_drop: i32, target_score: i32) -> Option<usize> {
        let mut min_size = if *size.start() < L { L } else { *size.start() };
        let max_size = if *size.end() < L { L } else { *size.end() };
        assert!(min_size < (u16::MAX as usize) && max_size < (u16::MAX as usize), "Block sizes must be smaller than 2^16 - 1!");
        assert!(min_size.is_power_of_two() && max_size.is_power_of_two(), "Block sizes must be powers of two!");

        while min_size <= max_size {
            self.align(query, reference, matrix, gaps, min_size..=max_size, x_drop);
            let curr_score = self.res().score;

            if curr_score >= target_score {
                return Some(min_size);
            }

            min_size *= 2;
        }

        None
    }

    /// Align a sequence to a profile with block aligner.
    ///
    /// If `TRACE` is true, then information for computing the traceback will be stored.
    /// After alignment, the traceback CIGAR string can then be computed.
    /// This will slow down alignment and use a lot more memory.
    ///
    /// If `X_DROP` is true, then the alignment process will be terminated early when
    /// the max score in the current block drops by `x_drop` below the max score encountered
    /// so far. The location of the max score is stored in the alignment result.
    /// This allows the alignment to end anywhere in the DP matrix.
    /// If `X_DROP` is false, then global alignment is done.
    ///
    /// If `LOCAL_START` is true, then the alignment is allowed to start anywhere in the DP matrix.
    /// Local alignment can be accomplished by setting `LOCAL_START` and `X_DROP` to true and `x_drop`
    /// to a very large value.
    ///
    /// If `FREE_QUERY_START_GAPS` is true, then gaps before the start of the query are free.
    ///
    /// If `FREE_QUERY_END_GAPS` is true, then gaps after the end of the query are free.
    /// Note that this has a limitation: the min block size must be greater than the length of the query.
    ///
    /// Since larger scores are better, gap and mismatch penalties must be negative.
    ///
    /// The minimum and maximum sizes of the block must be powers of 2 that are greater than the
    /// number of 16-bit lanes in a SIMD vector.
    ///
    /// The block aligner algorithm will dynamically shift a block down or right and grow its size
    /// to efficiently calculate the alignment between two strings.
    /// This is fast, but it may be slightly less accurate than computing the entire alignment
    /// dynamic programming matrix. Growing the size of the block allows larger gaps and
    /// other potentially difficult regions to be handled correctly.
    /// The algorithm also allows shrinking the block size for greater efficiency when handling
    /// regions in the sequences with no gaps.
    /// 16-bit deltas and 32-bit offsets are used to ensure that accurate scores are
    /// computed, even when the strings are long.
    ///
    /// When aligning sequence `q` against profile `p`, this algorithm computes cells in the DP matrix
    /// with `|q| + 1` rows and `|p| + 1` columns.
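    ///
    /// Sketch of the call shape (constructing a concrete `Profile` implementation is omitted
    /// here; `profile` stands for any value of a type implementing `Profile`):
    ///
    /// ```ignore
    /// let mut a = Block::<false, false>::new(q.len(), profile.len(), max_block_size);
    /// a.align_profile(&q, &profile, min_block_size..=max_block_size, 0);
    /// let res = a.res();
    /// ```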
942    pub fn align_profile<P: Profile>(&mut self, query: &PaddedBytes, profile: &P, size: RangeInclusive<usize>, x_drop: i32) {
943        // check invariants so bad stuff doesn't happen later
944        assert!(profile.get_gap_extend() < 0, "Gap extend cost must be negative!");
945        let min_size = if *size.start() < L { L } else { *size.start() };
946        let max_size = if *size.end() < L { L } else { *size.end() };
947        assert!(min_size < (u16::MAX as usize) && max_size < (u16::MAX as usize), "Block sizes must be smaller than 2^16 - 1!");
948        assert!(min_size.is_power_of_two() && max_size.is_power_of_two(), "Block sizes must be powers of two!");
949        if X_DROP {
950            assert!(x_drop >= 0, "X-drop threshold amount must be nonnegative!");
951        }
952        assert!(!LOCAL_START || !FREE_QUERY_START_GAPS, "Cannot set both LOCAL_START and FREE_QUERY_START_GAPS!");
953        assert!(!X_DROP || !FREE_QUERY_END_GAPS, "Cannot set both X_DROP and FREE_QUERY_END_GAPS!");
954        assert!(!FREE_QUERY_END_GAPS || min_size > query.len(), "Min block size must be larger than the query length for FREE_QUERY_END_GAPS!");
955
956        unsafe { self.allocated.clear(query.len(), profile.len(), max_size, TRACE); }
957
958        let s = StateProfile {
959            query,
960            i: 0,
961            reference: profile,
962            j: 0,
963            min_size,
964            max_size,
965            x_drop
966        };
967        unsafe { self.align_profile_core(s); }
968    }
969
970    /// Align a sequence to a profile with exponential search on the min block size.
971    ///
972    /// This calls `align_profile` multiple times, doubling the min block size in each iteration
973    /// until either the max block size is reached or the score reaches or exceeds the target score.
974    pub fn align_profile_exp<P: Profile>(&mut self, query: &PaddedBytes, profile: &P, size: RangeInclusive<usize>, x_drop: i32, target_score: i32) -> Option<usize> {
975        let mut min_size = if *size.start() < L { L } else { *size.start() };
976        let max_size = if *size.end() < L { L } else { *size.end() };
977        assert!(min_size < (u16::MAX as usize) && max_size < (u16::MAX as usize), "Block sizes must be smaller than 2^16 - 1!");
978        assert!(min_size.is_power_of_two() && max_size.is_power_of_two(), "Block sizes must be powers of two!");
979
980        while min_size <= max_size {
981            self.align_profile(query, profile, min_size..=max_size, x_drop);
982            let curr_score = self.res().score;
983
984            if curr_score >= target_score {
985                return Some(min_size);
986            }
987
988            min_size *= 2;
989        }
990
991        None
992    }
993
994    align_core_gen!(align_core, Matrix, State, Self::place_block, Self::place_block);
995    align_core_gen!(align_profile_core, Profile, StateProfile, Self::place_block_profile_right, Self::place_block_profile_down);
996
997    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
998    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
999    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1000    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1001    #[allow(non_snake_case)]
1002    #[inline]
1003    unsafe fn just_offset(block_size: usize, buf1: *mut i16, buf2: *mut i16, off_add: Simd) {
1004        let mut i = 0;
1005        while i < block_size {
1006            let a = simd_adds_i16(simd_load(buf1.add(i) as _), off_add);
1007            let b = simd_adds_i16(simd_load(buf2.add(i) as _), off_add);
1008            simd_store(buf1.add(i) as _, a);
1009            simd_store(buf2.add(i) as _, b);
1010            i += L;
1011        }
1012    }
1013
1014    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1015    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1016    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1017    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1018    #[allow(non_snake_case)]
1019    #[inline]
1020    unsafe fn prefix_max(buf: *const i16) -> i16 {
1021        simd_prefix_hmax_i16!(simd_load(buf as _), STEP)
1022    }
1023
1024    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1025    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1026    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1027    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1028    #[allow(non_snake_case)]
1029    #[inline]
1030    unsafe fn suffix_max(buf: *const i16, buf_len: usize) -> i16 {
1031        simd_suffix_hmax_i16!(simd_load(buf.add(buf_len - L) as _), SHRINK_SUFFIX_LEN)
1032    }
1033
1034    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1035    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1036    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1037    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1038    #[allow(non_snake_case)]
1039    #[inline]
1040    unsafe fn shift_and_offset(block_size: usize, buf1: *mut i16, buf2: *mut i16, temp_buf1: *mut i16, temp_buf2: *mut i16, off_add: Simd) -> Simd {
1041        let mut curr1 = simd_adds_i16(simd_load(buf1 as _), off_add);
1042        let D_corner = simd_set1_i16(simd_extract_i16!(curr1, STEP - 1));
1043        let mut curr2 = simd_adds_i16(simd_load(buf2 as _), off_add);
1044
1045        let mut i = 0;
1046        while i < block_size - L {
1047            let next1 = simd_adds_i16(simd_load(buf1.add(i + L) as _), off_add);
1048            let next2 = simd_adds_i16(simd_load(buf2.add(i + L) as _), off_add);
1049            simd_store(buf1.add(i) as _, simd_step(next1, curr1));
1050            simd_store(buf2.add(i) as _, simd_step(next2, curr2));
1051            curr1 = next1;
1052            curr2 = next2;
1053            i += L;
1054        }
1055
1056        let next1 = simd_load(temp_buf1 as _);
1057        let next2 = simd_load(temp_buf2 as _);
1058        simd_store(buf1.add(block_size - L) as _, simd_step(next1, curr1));
1059        simd_store(buf2.add(block_size - L) as _, simd_step(next2, curr2));
1060        D_corner
1061    }
1062
1063    /// Place block right or down for sequence-sequence alignment.
1064    ///
1065    /// Although conceptually blocks are squares, this function is actually used to compute any
1066    /// rectangular region. For example, when shifting a block right by some step
1067    /// size, only the rectangular region with width = step size needs to be computed, since
1068    /// the new shifted block will partially overlap with the previous block.
1069    ///
1070    /// Assumes all inputs are already relative to the current offset.
1071    ///
1072    /// Inside this function, everything will be treated as shifting right,
1073    /// conceptually. The same process can be trivially used for shifting
1074    /// down by calling this function with different parameters.
1075    ///
1076    /// The same function can be reused for right and down shifts because
1077    /// sequence to sequence alignment is symmetric.
1078    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1079    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1080    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1081    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1082    #[allow(non_snake_case)]
1083    unsafe fn place_block<M: Matrix>(state: &State<M>,
1084                                     query: &PaddedBytes,
1085                                     reference: &PaddedBytes,
1086                                     trace: &mut Trace,
1087                                     start_i: usize,
1088                                     start_j: usize,
1089                                     width: usize,
1090                                     height: usize,
1091                                     D_col: *mut i16,
1092                                     C_col: *mut i16,
1093                                     D_row: *mut i16,
1094                                     R_row: *mut i16,
1095                                     mut D_corner: Simd,
1096                                     relative_zero: i16,
1097                                     right: bool) -> (Simd, Simd, Simd) {
1098        let gap_open = simd_set1_i16(state.gaps.open as i16);
1099        let gap_extend = simd_set1_i16(state.gaps.extend as i16);
1100        let (gap_extend_all, prefix_scan_consts) = get_prefix_scan_consts(gap_extend);
1101        let mut D_max = simd_set1_i16(MIN);
1102        let mut D_argmax_i = simd_set1_i16(0);
1103        let mut D_argmax_j = simd_set1_i16(0);
1104
1105        if width == 0 || height == 0 {
1106            return (D_max, D_argmax_i, D_argmax_j);
1107        }
1108
1109        // hottest loop in the whole program
1110        for j in 0..width {
1111            let mut R01 = simd_set1_i16(MIN);
1112            let mut D11 = simd_set1_i16(MIN);
1113            let mut R11 = simd_set1_i16(MIN);
1114            let mut prev_trace_R = simd_set1_i16(0);
1115
1116            let c = reference.get(start_j + j);
1117
1118            let mut i = 0;
1119            while i < height {
1120                #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "mca"))]
1121                asm!("# LLVM-MCA-BEGIN place_block inner loop", options(nomem, nostack, preserves_flags));
1122
1123                let D10 = simd_load(D_col.add(i) as _);
1124                let C10 = simd_load(C_col.add(i) as _);
1125                let D00 = simd_sl_i16!(D10, D_corner, 1);
1126                D_corner = D10;
1127
1128                let scores = state.matrix.get_scores(c, halfsimd_loadu(query.as_ptr(start_i + i) as _), right);
1129                D11 = simd_adds_i16(D00, scores);
1130                if (!LOCAL_START && start_i + i == 0 && start_j + j == 0) || (FREE_QUERY_START_GAPS && right && start_i + i == 0) {
1131                    D11 = simd_insert_i16!(D11, relative_zero, 0);
1132                }
1133
1134                if LOCAL_START {
1135                    D11 = simd_max_i16(D11, simd_set1_i16(relative_zero));
1136                }
1137
1138                let C11_open = simd_adds_i16(D10, gap_open);
1139                let C11 = simd_max_i16(simd_adds_i16(C10, gap_extend), C11_open);
1140                D11 = simd_max_i16(D11, C11);
1141                // at this point, C11 is fully calculated and D11 is partially calculated
1142
1143                let D11_open = simd_adds_i16(D11, simd_subs_i16(gap_open, gap_extend));
1144                R11 = simd_prefix_scan_i16(D11_open, gap_extend, prefix_scan_consts);
1145                // do the prefix scan before using R01 to break up the dependency chain on
1146                // the last element of R01 from the previous loop iteration
1147                R11 = simd_max_i16(R11, simd_adds_i16(simd_broadcasthi_i16(R01), gap_extend_all));
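                // net effect of the prefix scan plus the carry-in above, per lane k of this
                // chunk (a sketch; the exact constants come from get_prefix_scan_consts):
                //   R11[k] = max( max over 0 <= t <= k of (D11_open[t] + (k - t) * gap_extend),
                //                 last lane of R01 + (k + 1) * gap_extend )
                // i.e. the R recurrence unrolled across the whole vector, with no sequential
                // dependency between lanes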
1148                // fully calculate D11 using R11
1149                D11 = simd_max_i16(D11, R11);
1150                R01 = R11;
1151
1152                #[cfg(feature = "debug")]
1153                {
1154                    print!("s:   ");
1155                    simd_dbg_i16(scores);
1156                    print!("D00: ");
1157                    simd_dbg_i16(simd_subs_i16(D00, simd_set1_i16(ZERO)));
1158                    print!("C11: ");
1159                    simd_dbg_i16(simd_subs_i16(C11, simd_set1_i16(ZERO)));
1160                    print!("R11: ");
1161                    simd_dbg_i16(simd_subs_i16(R11, simd_set1_i16(ZERO)));
1162                    print!("D11: ");
1163                    simd_dbg_i16(simd_subs_i16(D11, simd_set1_i16(ZERO)));
1164                }
1165
1166                if TRACE {
1167                    let trace_D_C = simd_cmpeq_i16(D11, C11);
1168                    let trace_D_R = simd_cmpeq_i16(D11, R11);
1169                    #[cfg(feature = "debug")]
1170                    {
1171                        print!("D_C: ");
1172                        simd_dbg_i16(trace_D_C);
1173                        print!("D_R: ");
1174                        simd_dbg_i16(trace_D_R);
1175                    }
1176                    // compress trace with movemask to save space
1177                    let mask = simd_set1_i16(0xFF00u16 as i16);
1178                    let trace_data = simd_movemask_i8(simd_blend_i8(trace_D_C, trace_D_R, mask));
1179                    let temp_trace_R = simd_cmpeq_i16(R11, D11_open);
1180                    let trace_R = simd_sl_i16!(temp_trace_R, prev_trace_R, 1);
1181                    let trace_data2 = simd_movemask_i8(simd_blend_i8(simd_cmpeq_i16(C11, C11_open), trace_R, mask));
1182                    prev_trace_R = temp_trace_R;
1183
1184                    if LOCAL_START {
1185                        let zero_mask = simd_cmpeq_i16(D11, simd_set1_i16(relative_zero));
1186                        trace.add_zero_mask(simd_movemask_i8(zero_mask) as TraceType);
1187                    }
1188
1189                    trace.add_trace(trace_data as TraceType, trace_data2 as TraceType);
1190                }
1191
1192                D_max = simd_max_i16(D_max, D11);
1193
1194                if X_DROP || (FREE_QUERY_END_GAPS && start_i + i + L > query.len()) {
1195                    // keep track of the best score and its location
1196                    // note: can assume right = true and only the last SIMD vectors are needed for FREE_QUERY_END_GAPS,
1197                    // due to the limitation that the min block size must be greater than the query length
1198                    let mask = simd_cmpeq_i16(D_max, D11);
1199                    D_argmax_i = simd_blend_i8(D_argmax_i, simd_set1_i16(i as i16), mask);
1200                    D_argmax_j = simd_blend_i8(D_argmax_j, simd_set1_i16(j as i16), mask);
1201                }
1202
1203                simd_store(D_col.add(i) as _, D11);
1204                simd_store(C_col.add(i) as _, C11);
1205                i += L;
1206
1207                #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "mca"))]
1208                asm!("# LLVM-MCA-END", options(nomem, nostack, preserves_flags));
1209            }
1210
1211            D_corner = simd_set1_i16(MIN);
1212
1213            ptr::write(D_row.add(j), simd_extract_i16!(D11, L - 1));
1214            ptr::write(R_row.add(j), simd_extract_i16!(R11, L - 1));
1215
1216            if !X_DROP && !FREE_QUERY_END_GAPS && start_i + height > query.len()
1217                && start_j + j >= reference.len() {
1218                if TRACE {
1219                    // make sure that the trace index is updated since the rest of the loop
1220                    // iterations are skipped
1221                    trace.add_trace_idx((width - 1 - j) * (height / L));
1222                }
1223                break;
1224            }
1225        }
1226
1227        (D_max, D_argmax_i, D_argmax_j)
1228    }
1229
1230    place_block_profile_gen!(place_block_profile_right, query, &PaddedBytes, reference, &P, query, reference, true);
1231    place_block_profile_gen!(place_block_profile_down, reference, &P, query, &PaddedBytes, query, reference, false);
1232
1233    /// Get the resulting score and ending location of the alignment.
1234    #[inline]
1235    pub fn res(&self) -> AlignResult {
1236        self.res
1237    }
1238
1239    /// Get the trace of the alignment, assuming `TRACE` is true.
1240    #[inline]
1241    pub fn trace(&self) -> &Trace {
1242        assert!(TRACE);
1243        &self.allocated.trace
1244    }
1245}
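
// For reference, a minimal scalar (non-SIMD, non-blocked) sketch of the gap-affine recurrence
// that the vectorized inner loop of `place_block` evaluates per cell. The real code additionally
// works in a shifted i16 space relative to a block offset, only touches a moving block of the DP
// matrix, and handles the LOCAL_START, X_DROP, and free query gap variants. This helper and its
// name are illustrative only and it is not called anywhere in the crate.
#[allow(dead_code)]
fn scalar_affine_reference(q: &[u8], r: &[u8], score: impl Fn(u8, u8) -> i32, gap_open: i32, gap_extend: i32) -> i32 {
    const NEG_INF: i32 = i32::MIN / 2; // large negative sentinel with headroom so adding gap costs cannot overflow
    let (n, m) = (q.len(), r.len());
    let mut d = vec![vec![NEG_INF; m + 1]; n + 1]; // best score ending at (i, j)
    let mut c = vec![vec![NEG_INF; m + 1]; n + 1]; // best score ending in a gap in the query (consumes reference)
    let mut rr = vec![vec![NEG_INF; m + 1]; n + 1]; // best score ending in a gap in the reference (consumes query)
    d[0][0] = 0;
    for j in 1..=m {
        // gap_open already includes the first extension, matching the convention used in this file
        c[0][j] = cmp::max(c[0][j - 1] + gap_extend, d[0][j - 1] + gap_open);
        d[0][j] = c[0][j];
    }
    for i in 1..=n {
        rr[i][0] = cmp::max(rr[i - 1][0] + gap_extend, d[i - 1][0] + gap_open);
        d[i][0] = rr[i][0];
        for j in 1..=m {
            c[i][j] = cmp::max(c[i][j - 1] + gap_extend, d[i][j - 1] + gap_open);
            rr[i][j] = cmp::max(rr[i - 1][j] + gap_extend, d[i - 1][j] + gap_open);
            d[i][j] = cmp::max(d[i - 1][j - 1] + score(q[i - 1], r[j - 1]), cmp::max(c[i][j], rr[i][j]));
        }
    }
    d[n][m] // global alignment score
}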
1246
1247/// Allocated scratch spaces for alignment.
1248///
1249/// Scratch spaces can be reused for aligning strings with shorter lengths
1250/// and smaller block sizes.
1251#[allow(non_snake_case)]
1252struct Allocated {
1253    pub trace: Trace,
1254
1255    // bottom and right borders of the current block
1256    pub D_col: Aligned,
1257    pub C_col: Aligned,
1258    pub D_row: Aligned,
1259    pub R_row: Aligned,
1260
1261    // the state at the previous checkpoint (where the latest best score was encountered)
1262    pub D_col_ckpt: Aligned,
1263    pub C_col_ckpt: Aligned,
1264    pub D_row_ckpt: Aligned,
1265    pub R_row_ckpt: Aligned,
1266
1267    // reused buffers for storing values that must be shifted
1268    // into the other border when the block moves in one direction
1269    pub temp_buf1: Aligned,
1270    pub temp_buf2: Aligned,
1271
1272    query_len: usize,
1273    reference_len: usize,
1274    max_size: usize,
1275    trace_flag: bool
1276}
1277
1278impl Allocated {
1279    #[allow(non_snake_case)]
1280    fn new(query_len: usize, reference_len: usize, max_size: usize, trace_flag: bool, local_start: bool, free_query_start_gaps: bool) -> Self {
1281        unsafe {
1282            let trace = if trace_flag {
1283                Trace::new(query_len, reference_len, max_size, local_start, free_query_start_gaps)
1284            } else {
1285                Trace::new(0, 0, 0, false, false)
1286            };
1287            let D_col = Aligned::new(max_size);
1288            let C_col = Aligned::new(max_size);
1289            let D_row = Aligned::new(max_size);
1290            let R_row = Aligned::new(max_size);
1291            let D_col_ckpt = Aligned::new(max_size);
1292            let C_col_ckpt = Aligned::new(max_size);
1293            let D_row_ckpt = Aligned::new(max_size);
1294            let R_row_ckpt = Aligned::new(max_size);
1295            let temp_buf1 = Aligned::new(L);
1296            let temp_buf2 = Aligned::new(L);
1297
1298            Self {
1299                trace,
1300                D_col,
1301                C_col,
1302                D_row,
1303                R_row,
1304                D_col_ckpt,
1305                C_col_ckpt,
1306                D_row_ckpt,
1307                R_row_ckpt,
1308                temp_buf1,
1309                temp_buf2,
1310                query_len,
1311                reference_len,
1312                max_size,
1313                trace_flag
1314            }
1315        }
1316    }
1317
1318    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1319    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1320    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1321    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1322    unsafe fn clear(&mut self, query_len: usize, reference_len: usize, max_size: usize, trace_flag: bool) {
1323        // do not overwrite query_len, reference_len, etc. because they are upper bounds
1324        assert!(query_len + reference_len <= self.query_len + self.reference_len);
1325        assert!(max_size <= self.max_size);
1326        assert_eq!(trace_flag, self.trace_flag);
1327
1328        self.trace.clear(query_len, reference_len);
1329        self.D_col.clear(max_size);
1330        self.C_col.clear(max_size);
1331        self.D_row.clear(max_size);
1332        self.R_row.clear(max_size);
1333        self.D_col_ckpt.clear(max_size);
1334        self.C_col_ckpt.clear(max_size);
1335        self.D_row_ckpt.clear(max_size);
1336        self.R_row_ckpt.clear(max_size);
1337        self.temp_buf1.clear(L);
1338        self.temp_buf2.clear(L);
1339    }
1340}
1341
1342/// Holds the trace generated by block aligner.
1343#[derive(Clone)]
1344pub struct Trace {
1345    trace: Vec<TraceType>,
1346    trace2: Vec<TraceType>,
1347    right: Vec<u64>,
1348    block_start: Vec<u32>,
1349    block_size: Vec<u16>,
1350    zero_mask: Vec<TraceType>,
1351    trace_idx: usize,
1352    block_idx: usize,
1353    ckpt_trace_idx: usize,
1354    ckpt_block_idx: usize,
1355    query_len: usize,
1356    reference_len: usize,
1357    local_start: bool,
1358    free_query_start_gaps: bool
1359}
1360
1361impl Trace {
1362    #[inline]
1363    fn new(query_len: usize, reference_len: usize, max_size: usize, local_start: bool, free_query_start_gaps: bool) -> Self {
1364        let len = query_len + reference_len + 2;
1365        let trace = vec![0 as TraceType; (max_size / L) * (len + max_size * 2)];
1366        let trace2 = vec![0 as TraceType; (max_size / L) * (len + max_size * 2)];
1367        let right = vec![0u64; div_ceil(len, 64)];
1368        let block_start = vec![0u32; len * 2];
1369        let block_size = vec![0u16; len * 2];
1370        let zero_mask = if local_start {
1371            vec![0 as TraceType; (max_size / L) * (len + max_size * 2)]
1372        } else {
1373            vec![]
1374        };
1375
1376        Self {
1377            trace,
1378            trace2,
1379            right,
1380            block_start,
1381            block_size,
1382            zero_mask,
1383            trace_idx: 0,
1384            block_idx: 0,
1385            ckpt_trace_idx: 0,
1386            ckpt_block_idx: 0,
1387            query_len,
1388            reference_len,
1389            local_start,
1390            free_query_start_gaps,
1391        }
1392    }
1393
1394    #[inline]
1395    fn clear(&mut self, query_len: usize, reference_len: usize) {
1396        // no need to clear trace, block_start, and block_size
1397        self.right.fill(0);
1398        self.trace_idx = 0;
1399        self.block_idx = 0;
1400        self.ckpt_trace_idx = 0;
1401        self.ckpt_block_idx = 0;
1402        self.query_len = query_len;
1403        self.reference_len = reference_len;
1404    }
1405
1406    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1407    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1408    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1409    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1410    #[inline]
1411    unsafe fn add_trace(&mut self, t: TraceType, t2: TraceType) {
1412        debug_assert!(self.trace_idx < self.trace.len());
1413        store_trace(self.trace.as_mut_ptr().add(self.trace_idx), t);
1414        store_trace(self.trace2.as_mut_ptr().add(self.trace_idx), t2);
1415        self.trace_idx += 1;
1416    }
1417
1418    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1419    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1420    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1421    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1422    #[inline]
1423    unsafe fn add_zero_mask(&mut self, mask: TraceType) {
1424        store_trace(self.zero_mask.as_mut_ptr().add(self.trace_idx), mask);
1425    }
1426
1427    #[inline]
1428    fn add_block(&mut self, i: usize, j: usize, width: usize, height: usize, right: bool) {
1429        debug_assert!(self.block_idx * 2 < self.block_start.len());
1430        unsafe {
1431            *self.block_start.as_mut_ptr().add(self.block_idx * 2) = i as u32;
1432            *self.block_start.as_mut_ptr().add(self.block_idx * 2 + 1) = j as u32;
1433            *self.block_size.as_mut_ptr().add(self.block_idx * 2) = height as u16;
1434            *self.block_size.as_mut_ptr().add(self.block_idx * 2 + 1) = width as u16;
1435
1436            let a = self.block_idx / 64;
1437            let b = self.block_idx % 64;
1438            let v = *self.right.as_ptr().add(a) & !(1 << b); // clear bit
1439            *self.right.as_mut_ptr().add(a) = v | ((right as u64) << b);
1440
1441            self.block_idx += 1;
1442        }
1443    }
1444
1445    #[inline]
1446    fn add_trace_idx(&mut self, add: usize) {
1447        self.trace_idx += add;
1448    }
1449
1450    #[inline]
1451    fn save_ckpt(&mut self) {
1452        self.ckpt_trace_idx = self.trace_idx;
1453        self.ckpt_block_idx = self.block_idx;
1454    }
1455
1456    /// The trace data structure is like a stack, so all trace values and blocks after the
1457    /// checkpoint are essentially popped off the stack.
1458    #[inline]
1459    fn restore_ckpt(&mut self) {
1460        self.trace_idx = self.ckpt_trace_idx;
1461        self.block_idx = self.ckpt_block_idx;
1462    }
1463
1464    /// Create a CIGAR string that represents a single traceback path ending at the specified
1465    /// location.
1466    ///
1467    /// When aligning `q` against `r`, this represents the edits to go from `r` to `q`.
1468    /// Matches and mismatches are both represented with `M`.
1469    pub fn cigar(&self, i: usize, j: usize, cigar: &mut Cigar) {
1470        self.cigar_core::<false>(i, j, None, None, cigar);
1471    }
1472
1473    /// Create a CIGAR string that represents a single traceback path ending at the specified
1474    /// location.
1475    ///
1476    /// When aligning `q` against `r`, this represents the edits to go from `r` to `q`.
1477    /// Matches are represented using `=` and mismatches are represented using `X`.
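    ///
    /// A minimal usage sketch, adapted from the unit tests at the bottom of this file
    /// (the `block_aligner::...` paths are assumed from the crate's public module layout,
    /// and one of the crate's SIMD features must be enabled, as for the rest of this file):
    ///
    /// ```
    /// use block_aligner::{cigar::*, scan_block::*, scores::*};
    ///
    /// let gaps = Gaps { open: -11, extend: -1 };
    /// let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAA", 16);
    /// let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAARRA", 16);
    /// let mut a = Block::<true, false>::new(100, 100, 16);
    /// a.align(&q, &r, &BLOSUM62, gaps, 16..=16, 0);
    /// let res = a.res();
    /// let mut cigar = Cigar::new(100, 100);
    /// a.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
    /// assert_eq!(cigar.to_string(), "3=2X1=");
    /// ```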
1478    pub fn cigar_eq(&self, query: &PaddedBytes, reference: &PaddedBytes, i: usize, j: usize, cigar: &mut Cigar) {
1479        self.cigar_core::<true>(i, j, Some(query), Some(reference), cigar);
1480    }
1481
1482    fn cigar_core<const EQ: bool>(&self, mut i: usize, mut j: usize, q: Option<&PaddedBytes>, r: Option<&PaddedBytes>, cigar: &mut Cigar) {
1483        assert!(i <= self.query_len && j <= self.reference_len, "Traceback cigar end position must be in bounds!");
1484        if EQ {
1485            assert!(q.is_some() && r.is_some());
1486        }
1487
1488        cigar.clear(i, j);
1489
1490        unsafe {
1491            let mut block_idx = self.block_idx;
1492            let mut trace_idx = self.trace_idx;
1493            let mut block_i;
1494            let mut block_j;
1495            let mut block_width;
1496            let mut block_height;
1497            let mut right;
1498
1499            #[derive(Copy, Clone, PartialEq, Debug)]
1500            enum Table {
1501                D = 0b00,
1502                C = 0b01,
1503                R = 0b10
1504            }
1505
1506            // use a lookup table instead of hard-to-predict branches;
1507            // the table is constructed at compile time!
1508            static OP_LUT: [[(Operation, usize, usize, Table); 64]; 2] = {
1509                let mut lut = [[(Operation::D, 0, 1, Table::D); 64]; 2];
1510
1511                // table: the current DP table, D, C, or R (tables are standardized to right = true, C and R would be swapped for right = false)
1512                // trace: 2 bits, first bit is whether the max equals C table entry, second bit is
1513                // whether the max equals R table entry (vice versa for right = false)
1514                // trace2: 2 bits, first bit is whether the max in the C table is the gap beginning, second
1515                // bit is whether the max in the R table is the gap beginning (vice versa for right = false)
1516                // right: whether the current block contains vectors laid out vertically
1517
1518                let mut right = 0;
1519                while right < 2 {
1520                    let mut trace = 0;
1521                    while trace < 4 {
1522                        let mut trace2 = 0;
1523                        while trace2 < 4 {
1524                            let mut table_idx = 0;
1525                            while table_idx < 3 {
1526                                let table = match table_idx {
1527                                    0b00 => Table::D,
1528                                    0b01 => Table::C,
1529                                    _ => Table::R
1530                                };
1531
1532                                let res = if right == 1 {
1533                                    match (trace, trace2, table) {
1534                                        (_, 0b00 | 0b10, Table::C) => (Operation::D, 0, 1, Table::C), // C table gap extend
1535                                        (_, 0b01 | 0b11, Table::C) => (Operation::D, 0, 1, Table::D), // C table gap open
1536                                        (_, 0b00 | 0b01, Table::R) => (Operation::I, 1, 0, Table::R), // R table gap extend
1537                                        (_, 0b10 | 0b11, Table::R) => (Operation::I, 1, 0, Table::D), // R table gap open
1538                                        (0b00, _, Table::D) => (Operation::M, 1, 1, Table::D), // D table match/mismatch
1539                                        (0b01 | 0b11, 0b00 | 0b10, Table::D) => (Operation::D, 0, 1, Table::C), // D table C gap extend
1540                                        (0b01 | 0b11, 0b01 | 0b11, Table::D) => (Operation::D, 0, 1, Table::D), // D table C gap open
1541                                        (0b10, 0b00 | 0b01, Table::D) => (Operation::I, 1, 0, Table::R), // D table R gap extend
1542                                        (0b10, 0b10 | 0b11, Table::D) => (Operation::I, 1, 0, Table::D), // D table R gap open
1543                                        _ => (Operation::D, 0, 1, Table::D)
1544                                    }
1545                                } else {
1546                                    // everything is basically swapped (C/R and I/D) for down (right = false)
1547                                    match (trace, trace2, table) {
1548                                        (_, 0b00 | 0b10, Table::R) => (Operation::I, 1, 0, Table::R), // R table gap extend
1549                                        (_, 0b01 | 0b11, Table::R) => (Operation::I, 1, 0, Table::D), // R table gap open
1550                                        (_, 0b00 | 0b01, Table::C) => (Operation::D, 0, 1, Table::C), // C table gap extend
1551                                        (_, 0b10 | 0b11, Table::C) => (Operation::D, 0, 1, Table::D), // C table gap open
1552                                        (0b00, _, Table::D) => (Operation::M, 1, 1, Table::D), // D table match/mismatch
1553                                        (0b01 | 0b11, 0b00 | 0b10, Table::D) => (Operation::I, 1, 0, Table::R), // D table R gap extend
1554                                        (0b01 | 0b11, 0b01 | 0b11, Table::D) => (Operation::I, 1, 0, Table::D), // D table R gap open
1555                                        (0b10, 0b00 | 0b01, Table::D) => (Operation::D, 0, 1, Table::C), // D table C gap extend
1556                                        (0b10, 0b10 | 0b11, Table::D) => (Operation::D, 0, 1, Table::D), // D table C gap open
1557                                        _ => (Operation::I, 1, 0, Table::D)
1558                                    }
1559                                };
1560
1561                                lut[right][(trace << 4) | (trace2 << 2) | (table as usize)] = res;
1562                                table_idx += 1;
1563                            }
1564                            trace2 += 1;
1565                        }
1566                        trace += 1;
1567                    }
1568                    right += 1;
1569                }
1570
1571                lut
1572            };
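            // worked example: with right = 1, trace bits t = 0b01 (the max equals the C entry),
            // trace2 bits t2 = 0b01 (the C max came from a gap open), and table = D, the index
            // is (0b01 << 4) | (0b01 << 2) | 0b00 = 20, which maps to (Operation::D, 0, 1, Table::D):
            // emit a deletion, consume one reference character, and continue the traceback in the D table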
1573
1574            let mut table = Table::D;
1575
1576            'outer: while i > 0 || j > 0 {
1577                // find the current block that contains (i, j)
1578                loop {
1579                    block_idx -= 1;
1580                    block_i = *self.block_start.as_ptr().add(block_idx * 2) as usize;
1581                    block_j = *self.block_start.as_ptr().add(block_idx * 2 + 1) as usize;
1582                    block_height = *self.block_size.as_ptr().add(block_idx * 2) as usize;
1583                    block_width = *self.block_size.as_ptr().add(block_idx * 2 + 1) as usize;
1584                    trace_idx -= block_width * block_height / L;
1585
1586                    if i >= block_i && j >= block_j {
1587                        right = ((*self.right.as_ptr().add(block_idx / 64) >> (block_idx % 64)) & 0b1) as usize;
1588                        break;
1589                    }
1590                }
1591
1592                // compute traceback within the current block
1593                let lut = &*OP_LUT.as_ptr().add(right);
1594                if right > 0 {
1595                    // right block
1596                    while i >= block_i && j >= block_j && (i > 0 || j > 0) {
1597                        if self.free_query_start_gaps && i == 0 {
1598                            // can do this because the row (i == 0) must be within right blocks
1599                            break 'outer;
1600                        }
1601
1602                        let curr_i = i - block_i;
1603                        let curr_j = j - block_j;
1604                        let idx = trace_idx + curr_i / L + curr_j * (block_height / L);
1605
1606                        if self.local_start && table == Table::D {
1607                            // terminate alignment on zero
1608                            let zero = ((*self.zero_mask.as_ptr().add(idx) >> ((curr_i % L) * 2)) & 0b1) > 0;
1609                            if zero {
1610                                break 'outer;
1611                            }
1612                        }
1613
1614                        // build the index into the lookup table
1615                        let t = ((*self.trace.as_ptr().add(idx) >> ((curr_i % L) * 2)) & 0b11) as usize;
1616                        let t2 = ((*self.trace2.as_ptr().add(idx) >> ((curr_i % L) * 2)) & 0b11) as usize;
1617                        let lut_idx = (t << 4) | (t2 << 2) | (table as usize);
1618                        let lut_entry = &*lut.as_ptr().add(lut_idx);
1619
1620                        let op = if EQ && lut_entry.0 == Operation::M {
1621                            if q.unwrap_unchecked().get(i) == r.unwrap_unchecked().get(j) {
1622                                Operation::Eq
1623                            } else {
1624                                Operation::X
1625                            }
1626                        } else {
1627                            lut_entry.0
1628                        };
1629                        i -= lut_entry.1;
1630                        j -= lut_entry.2;
1631                        table = lut_entry.3;
1632                        cigar.add(op);
1633                    }
1634                } else {
1635                    // down block
1636                    while i >= block_i && j >= block_j && (i > 0 || j > 0) {
1637                        let curr_i = i - block_i;
1638                        let curr_j = j - block_j;
1639                        let idx = trace_idx + curr_j / L + curr_i * (block_width / L);
1640
1641                        if self.local_start && table == Table::D {
1642                            // terminate alignment on zero
1643                            let zero = ((*self.zero_mask.as_ptr().add(idx) >> ((curr_j % L) * 2)) & 0b1) > 0;
1644                            if zero {
1645                                break 'outer;
1646                            }
1647                        }
1648
1649                        // build the index into the lookup table
1650                        let t = ((*self.trace.as_ptr().add(idx) >> ((curr_j % L) * 2)) & 0b11) as usize;
1651                        let t2 = ((*self.trace2.as_ptr().add(idx) >> ((curr_j % L) * 2)) & 0b11) as usize;
1652                        let lut_idx = (t << 4) | (t2 << 2) | (table as usize);
1653                        let lut_entry = &*lut.as_ptr().add(lut_idx);
1654
1655                        let op = if EQ && lut_entry.0 == Operation::M {
1656                            if q.unwrap_unchecked().get(i) == r.unwrap_unchecked().get(j) {
1657                                Operation::Eq
1658                            } else {
1659                                Operation::X
1660                            }
1661                        } else {
1662                            lut_entry.0
1663                        };
1664                        i -= lut_entry.1;
1665                        j -= lut_entry.2;
1666                        table = lut_entry.3;
1667                        cigar.add(op);
1668                    }
1669                }
1670            }
1671        }
1672    }
1673
1674    /// Return all of the rectangular regions that were calculated separately as
1675    /// block aligner shifts and grows.
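    ///
    /// A minimal sketch (reusing the alignment settings from the unit tests at the bottom of
    /// this file) that prints every computed rectangle:
    ///
    /// ```
    /// use block_aligner::{scan_block::*, scores::*};
    ///
    /// let gaps = Gaps { open: -11, extend: -1 };
    /// let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAA", 16);
    /// let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAARRA", 16);
    /// let mut a = Block::<true, false>::new(100, 100, 16);
    /// a.align(&q, &r, &BLOSUM62, gaps, 16..=16, 0);
    /// for rect in a.trace().blocks() {
    ///     println!("{} {} {} {}", rect.row, rect.col, rect.width, rect.height);
    /// }
    /// ```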
1676    pub fn blocks(&self) -> Vec<Rectangle> {
1677        let mut res = Vec::with_capacity(self.block_idx);
1678
1679        for i in 0..self.block_idx {
1680            unsafe {
1681                res.push(Rectangle {
1682                    row: *self.block_start.as_ptr().add(i * 2) as usize,
1683                    col: *self.block_start.as_ptr().add(i * 2 + 1) as usize,
1684                    height: *self.block_size.as_ptr().add(i * 2) as usize,
1685                    width: *self.block_size.as_ptr().add(i * 2 + 1) as usize
1686                });
1687            }
1688        }
1689
1690        res
1691    }
1692}
1693
1694/// A rectangular region.
1695#[derive(Copy, Clone, PartialEq, Debug)]
1696pub struct Rectangle {
1697    pub row: usize,
1698    pub col: usize,
1699    pub width: usize,
1700    pub height: usize
1701}
1702
1703#[inline]
1704fn clamp(x: i32) -> i16 {
1705    cmp::min(cmp::max(x, i16::MIN as i32), i16::MAX as i32) as i16
1706}
1707
1708#[inline]
1709fn div_ceil(n: usize, d: usize) -> usize {
1710    (n + d - 1) / d
1711}
1712
1713/// An aligned buffer of `i16` values with the same alignment as SIMD vectors.
1714struct Aligned {
1715    layout: alloc::Layout,
1716    ptr: *const i16
1717}
1718
1719impl Aligned {
1720    pub unsafe fn new(block_size: usize) -> Self {
1721        // allocate `block_size` i16 elements (2 bytes each) with custom alignment equal to the SIMD vector width
1722        let layout = alloc::Layout::from_size_align_unchecked(block_size * 2, L_BYTES);
1723        let ptr = alloc::alloc_zeroed(layout) as *const i16;
1724        Self { layout, ptr }
1725    }
1726
1727    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1728    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1729    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1730    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1731    pub unsafe fn clear(&mut self, block_size: usize) {
1732        let mut i = 0;
1733        while i < block_size {
1734            simd_store(self.ptr.add(i) as _, simd_set1_i16(MIN));
1735            i += L;
1736        }
1737    }
1738
1739    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1740    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1741    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1742    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1743    #[inline]
1744    pub unsafe fn set_vec(&mut self, o: &Aligned, idx: usize) {
1745        simd_store(self.ptr.add(idx) as _, simd_load(o.as_ptr().add(idx) as _));
1746    }
1747
1748    #[cfg_attr(feature = "simd_sse2", target_feature(enable = "sse2"))]
1749    #[cfg_attr(feature = "simd_avx2", target_feature(enable = "avx2"))]
1750    #[cfg_attr(feature = "simd_wasm", target_feature(enable = "simd128"))]
1751    #[cfg_attr(feature = "simd_neon", target_feature(enable = "neon"))]
1752    #[inline]
1753    pub unsafe fn copy_vec(&mut self, new_idx: usize, idx: usize) {
1754        simd_store(self.ptr.add(new_idx) as _, simd_load(self.ptr.add(idx) as _));
1755    }
1756
1757    #[inline]
1758    pub fn get(&self, i: usize) -> i16 {
1759        unsafe { *self.ptr.add(i) }
1760    }
1761
1762    #[allow(dead_code)]
1763    #[inline]
1764    pub fn set(&mut self, i: usize, v: i16) {
1765        unsafe { ptr::write(self.ptr.add(i) as _, v); }
1766    }
1767
1768    #[inline]
1769    pub fn as_mut_ptr(&mut self) -> *mut i16 {
1770        self.ptr as _
1771    }
1772
1773    #[inline]
1774    pub fn as_ptr(&self) -> *const i16 {
1775        self.ptr
1776    }
1777}
1778
1779impl Drop for Aligned {
1780    fn drop(&mut self) {
1781        unsafe { alloc::dealloc(self.ptr as _, self.layout); }
1782    }
1783}
1784
1785/// A padded string that helps avoid out of bounds access when using SIMD.
1786///
1787/// A single padding byte is inserted before the start of the string,
1788/// and `block_size` bytes are inserted after the end of the string.
1789#[derive(Clone, PartialEq, Debug)]
1790pub struct PaddedBytes {
1791    s: Vec<u8>,
1792    len: usize
1793}
1794
1795impl PaddedBytes {
1796    /// Create an empty `PaddedBytes` instance that can hold byte strings
1797    /// up to a specific length.
1798    pub fn new<M: Matrix>(len: usize, block_size: usize) -> Self {
1799        Self {
1800            s: vec![M::convert_char(M::NULL); 1 + len + block_size],
1801            len
1802        }
1803    }
1804
1805    /// Modifies the bytes in place, filling in the rest of the memory with padding bytes.
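    ///
    /// A minimal sketch of the reuse pattern (assumes the `AAMatrix` matrix used in the unit
    /// tests at the bottom of this file):
    ///
    /// ```
    /// use block_aligner::{scan_block::*, scores::*};
    ///
    /// let mut s = PaddedBytes::new::<AAMatrix>(10, 16);
    /// s.set_bytes::<AAMatrix>(b"AAAA", 16);
    /// assert_eq!(s.len(), 4);
    /// ```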
1806    pub fn set_bytes<M: Matrix>(&mut self, b: &[u8], block_size: usize) {
1807        self.s[0] = M::convert_char(M::NULL);
1808        self.s[1..1 + b.len()].copy_from_slice(b);
1809        self.s[1..1 + b.len()].iter_mut().for_each(|c| *c = M::convert_char(*c));
1810        self.s[1 + b.len()..1 + b.len() + block_size].fill(M::convert_char(M::NULL));
1811        self.len = b.len();
1812    }
1813
1814    /// Modifies the bytes in place with the input reversed, filling in the rest of the memory with padding bytes.
1815    pub fn set_bytes_rev<M: Matrix>(&mut self, b: &[u8], block_size: usize) {
1816        self.s[0] = M::convert_char(M::NULL);
1817        self.s[1..1 + b.len()].copy_from_slice(b);
1818        self.s[1..1 + b.len()].reverse();
1819        self.s[1..1 + b.len()].iter_mut().for_each(|c| *c = M::convert_char(*c));
1820        self.s[1 + b.len()..1 + b.len() + block_size].fill(M::convert_char(M::NULL));
1821        self.len = b.len();
1822    }
1823
1824    /// Create from a byte slice.
1825    ///
1826    /// Make sure that `block_size` is greater than or equal to the upper bound
1827    /// block size used in the `Block::align` function.
1828    #[inline]
1829    pub fn from_bytes<M: Matrix>(b: &[u8], block_size: usize) -> Self {
1830        let mut v = b.to_owned();
1831        let len = v.len();
1832        v.insert(0, M::NULL);
1833        v.resize(v.len() + block_size, M::NULL);
1834        v.iter_mut().for_each(|c| *c = M::convert_char(*c));
1835        Self { s: v, len }
1836    }
1837
1838    /// Create from the bytes in a string slice.
1839    ///
1840    /// Make sure that `block_size` is greater than or equal to the upper bound
1841    /// block size used in the `Block::align` function.
1842    #[inline]
1843    pub fn from_str<M: Matrix>(s: &str, block_size: usize) -> Self {
1844        Self::from_bytes::<M>(s.as_bytes(), block_size)
1845    }
1846
1847    /// Create from the bytes in a string.
1848    ///
1849    /// Make sure that `block_size` is greater than or equal to the upper bound
1850    /// block size used in the `Block::align` function.
1851    #[inline]
1852    pub fn from_string<M: Matrix>(s: String, block_size: usize) -> Self {
1853        let mut v = s.into_bytes();
1854        let len = v.len();
1855        v.insert(0, M::NULL);
1856        v.resize(v.len() + block_size, M::NULL);
1857        v.iter_mut().for_each(|c| *c = M::convert_char(*c));
1858        Self { s: v, len }
1859    }
1860
1861    /// Get the byte at a certain index (unchecked).
1862    #[inline]
1863    pub unsafe fn get(&self, i: usize) -> u8 {
1864        *self.s.as_ptr().add(i)
1865    }
1866
1867    /// Set the byte at a certain index (unchecked).
1868    #[inline]
1869    pub unsafe fn set(&mut self, i: usize, c: u8) {
1870        *self.s.as_mut_ptr().add(i) = c;
1871    }
1872
1873    /// Get a raw pointer to a specific index.
1874    #[inline]
1875    pub unsafe fn as_ptr(&self, i: usize) -> *const u8 {
1876        self.s.as_ptr().add(i)
1877    }
1878
1879    /// Length of the original string (no padding).
1880    #[inline]
1881    pub fn len(&self) -> usize {
1882        self.len
1883    }
1884}
1885
1886/// Resulting score and alignment end position.
1887#[repr(C)]
1888#[derive(Copy, Clone, PartialEq, Debug)]
1889pub struct AlignResult {
1890    pub score: i32,
1891    pub query_idx: usize,
1892    pub reference_idx: usize
1893}
1894
1895#[derive(Copy, Clone, PartialEq, Debug)]
1896enum Direction {
1897    Right,
1898    Down,
1899    Grow
1900}
1901
1902#[cfg(test)]
1903mod tests {
1904    use crate::scores::*;
1905
1906    use super::*;
1907
1908    #[test]
1909    fn test_no_x_drop() {
1910        let test_gaps = Gaps { open: -11, extend: -1 };
1911
1912        let mut a = Block::<false, false>::new(100, 100, 16);
1913
1914        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
1915        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
1916        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1917        assert_eq!(a.res().score, 0);
1918
1919        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1920        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
1921        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1922        assert_eq!(a.res().score, -14);
1923
1924        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
1925        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1926        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1927        assert_eq!(a.res().score, -14);
1928
1929        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1930        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AARA", 16);
1931        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1932        assert_eq!(a.res().score, 11);
1933
1934        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAAAA", 16);
1935        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AARAAAA", 16);
1936        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1937        assert_eq!(a.res().score, 12);
1938
1939        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1940        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1941        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1942        assert_eq!(a.res().score, 16);
1943
1944        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1945        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AARA", 16);
1946        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1947        assert_eq!(a.res().score, 11);
1948
1949        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1950        let q = PaddedBytes::from_bytes::<AAMatrix>(b"RRRR", 16);
1951        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1952        assert_eq!(a.res().score, -4);
1953
1954        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
1955        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAA", 16);
1956        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
1957        assert_eq!(a.res().score, 1);
1958
1959        let test_gaps2 = Gaps { open: -2, extend: -1 };
1960
1961        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAN", 16);
1962        let q = PaddedBytes::from_bytes::<NucMatrix>(b"ATAA", 16);
1963        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1964        assert_eq!(a.res().score, 0);
1965
1966        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 16);
1967        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 16);
1968        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1969        assert_eq!(a.res().score, 32);
1970
1971        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 16);
1972        let q = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", 16);
1973        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1974        assert_eq!(a.res().score, -32);
1975
1976        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 16);
1977        let q = PaddedBytes::from_bytes::<NucMatrix>(b"TATATATATATATATATATATATATATATATA", 16);
1978        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1979        assert_eq!(a.res().score, 0);
1980
1981        let r = PaddedBytes::from_bytes::<NucMatrix>(b"TTAAAAAAATTTTTTTTTTTT", 16);
1982        let q = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", 16);
1983        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1984        assert_eq!(a.res().score, 7);
1985
1986        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAA", 16);
1987        let q = PaddedBytes::from_bytes::<NucMatrix>(b"C", 16);
1988        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
1989        assert_eq!(a.res().score, -5);
1990        a.align(&r, &q, &NW1, test_gaps2, 16..=16, 0);
1991        assert_eq!(a.res().score, -5);
1992    }
1993
1994    #[test]
1995    fn test_x_drop() {
1996        let test_gaps = Gaps { open: -11, extend: -1 };
1997
1998        let mut a = Block::<false, true>::new(100, 100, 16);
1999
2000        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2001        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2002        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2003        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2004
2005        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2006        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2007        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2008        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2009
2010        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2011        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2012        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2013        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2014
2015        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAARRA", 16);
2016        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAA", 16);
2017        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2018        assert_eq!(a.res(), AlignResult { score: 14, query_idx: 6, reference_idx: 6 });
2019
2020        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAAAAAAAAAAARRRRRRRRRRRRRRRRAAAAAAAAAAAAA", 16);
2021        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 16);
2022        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2023        assert_eq!(a.res(), AlignResult { score: 60, query_idx: 15, reference_idx: 15 });
2024
2025        let mut a = Block::<true, true>::new(2048, 2048, 2048);
2026        let long_str = std::iter::repeat(b'A').take(2048).collect::<Vec<_>>();
2027        let r = PaddedBytes::from_bytes::<AAMatrix>(&long_str, 2048);
2028        let q = PaddedBytes::from_bytes::<AAMatrix>(&long_str, 2048);
2029        a.align(&q, &r, &BLOSUM62, test_gaps, 2048..=2048, 100);
2030        assert_eq!(a.res(), AlignResult { score: 8192, query_idx: 2048, reference_idx: 2048 });
2031
2032        let mut a = Block::<true, true>::new(0, 0, 16);
2033
2034        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2035        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2036        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2037        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2038
2039        let mut a = Block::<true, true>::new(4, 4, 16);
2040
2041        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2042        let q = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2043        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2044        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2045
2046        let r = PaddedBytes::from_bytes::<AAMatrix>(b"", 16);
2047        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2048        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 1);
2049        assert_eq!(a.res(), AlignResult { score: 0, query_idx: 0, reference_idx: 0 });
2050    }
2051
2052    #[test]
2053    fn test_trace() {
2054        let test_gaps = Gaps { open: -11, extend: -1 };
2055
2056        let mut cigar = Cigar::new(100, 100);
2057
2058        let mut a = Block::<true, false>::new(100, 100, 16);
2059
2060        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAARRA", 16);
2061        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAAAA", 16);
2062        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
2063        let res = a.res();
2064        assert_eq!(res, AlignResult { score: 14, query_idx: 6, reference_idx: 6 });
2065        a.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2066        assert_eq!(cigar.to_string(), "3=2X1=");
2067
2068        let r = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2069        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAA", 16);
2070        a.align(&q, &r, &BLOSUM62, test_gaps, 16..=16, 0);
2071        let res = a.res();
2072        assert_eq!(res, AlignResult { score: 1, query_idx: 3, reference_idx: 4 });
2073        a.trace().cigar(res.query_idx, res.reference_idx, &mut cigar);
2074        assert_eq!(cigar.to_string(), "3M1D");
2075
2076        let test_gaps2 = Gaps { open: -2, extend: -1 };
2077
2078        let r = PaddedBytes::from_bytes::<NucMatrix>(b"TTAAAAAAATTTTTTTTTTTT", 16);
2079        let q = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", 16);
2080        a.align(&q, &r, &NW1, test_gaps2, 16..=16, 0);
2081        let res = a.res();
2082        assert_eq!(res, AlignResult { score: 7, query_idx: 24, reference_idx: 21 });
2083        a.trace().cigar(res.query_idx, res.reference_idx, &mut cigar);
2084        assert_eq!(cigar.to_string(), "2M6I16M3D");
2085
2086        let mut a = Block::<true, false>::new(100, 100, 32);
2087        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAATTGCGCT", 32);
2088        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAAAAAGCGC", 32);
2089
2090        a.align(&q, &r, &NW1, test_gaps2, 32..=32, 0);
2091        let res = a.res();
2092        assert_eq!(res, AlignResult { score: 8, query_idx: 16, reference_idx: 13 });
2093        a.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2094        assert_eq!(cigar.to_string(), "9=2I4=1I");
2095
2096        let matrix = NucMatrix::new_simple(2, -1);
2097        let test_gaps3 = Gaps { open: -5, extend: -2 };
2098        a.align(&q, &r, &matrix, test_gaps3, 32..=32, 0);
2099        let res = a.res();
2100        assert_eq!(res, AlignResult { score: 14, query_idx: 16, reference_idx: 13 });
2101        a.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2102        assert_eq!(cigar.to_string(), "9=2I4=1I");
2103    }
2104
2105    #[test]
2106    fn test_bytes() {
2107        let test_gaps = Gaps { open: -2, extend: -1 };
2108
2109        let mut a = Block::<false, false>::new(100, 100, 16);
2110
2111        let r = PaddedBytes::from_bytes::<ByteMatrix>(b"AAAaaA", 16);
2112        let q = PaddedBytes::from_bytes::<ByteMatrix>(b"AAAAAA", 16);
2113        a.align(&q, &r, &BYTES1, test_gaps, 16..=16, 0);
2114        assert_eq!(a.res().score, 2);
2115
2116        let r = PaddedBytes::from_bytes::<ByteMatrix>(b"abcdefg", 16);
2117        let q = PaddedBytes::from_bytes::<ByteMatrix>(b"abdefg", 16);
2118        a.align(&q, &r, &BYTES1, test_gaps, 16..=16, 0);
2119        assert_eq!(a.res().score, 4);
2120    }
2121
2122    #[test]
2123    fn test_profile() {
2124        let mut a = Block::<false, false>::new(100, 100, 16);
2125        let r = AAProfile::from_bytes(b"AAAA", 16, 1, -1, -1, 0, -1, -1);
2126        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2127        a.align_profile(&q, &r, 16..=16, 0);
2128        assert_eq!(a.res().score, 4);
2129
2130        let r = AAProfile::from_bytes(b"AATTAA", 16, 1, -1, -1, 0, -1, -1);
2131        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2132        a.align_profile(&q, &r, 16..=16, 0);
2133        assert_eq!(a.res().score, 1);
2134
2135        let r = AAProfile::from_bytes(b"AATTAA", 16, 1, -1, -1, -1, -1, -1);
2136        let q = PaddedBytes::from_bytes::<AAMatrix>(b"AAAA", 16);
2137        a.align_profile(&q, &r, 16..=16, 0);
2138        assert_eq!(a.res().score, 0);
2139
2140        let mut a = Block::<true, false>::new(100, 100, 16);
2141        let mut cigar = Cigar::new(100, 100);
2142
2143        let r = AAProfile::from_bytes(b"TTAAAAAAATTTTTTTTTTTT", 16, 1, -1, -1, 0, -1, -1);
2144        let q = PaddedBytes::from_bytes::<AAMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", 16);
2145        a.align_profile(&q, &r, 16..=16, 0);
2146        let res = a.res();
2147        assert_eq!(res, AlignResult { score: 7, query_idx: 24, reference_idx: 21 });
2148        a.trace().cigar(res.query_idx, res.reference_idx, &mut cigar);
2149        assert_eq!(cigar.to_string(), "2M6I16M3D");
2150
2151        let r = AAProfile::from_bytes(b"TTAAAAAAATTTTTTTTTTTT", 16, 1, -1, -1, -1, -1, -1);
2152        let q = PaddedBytes::from_bytes::<AAMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", 16);
2153        a.align_profile(&q, &r, 16..=16, 0);
2154        let res = a.res();
2155        assert_eq!(res, AlignResult { score: 6, query_idx: 24, reference_idx: 21 });
2156        a.trace().cigar(res.query_idx, res.reference_idx, &mut cigar);
2157        assert_eq!(cigar.to_string(), "2M6I16M3D");
2158
2159        let mut r = AAProfile::from_bytes(b"TTAAAAAAATTTTTTTTTTTT", 16, 1, -1, -2, -1, -1, -1);
2160        r.set_gap_close_C(17, -1);
2161        r.set_gap_close_C(19, 0);
2162        let q = PaddedBytes::from_bytes::<AAMatrix>(b"TTTTTTTTAAAAAAATTTTTTTTT", 16);
2163        a.align_profile(&q, &r, 16..=16, 0);
2164        let res = a.res();
2165        assert_eq!(res, AlignResult { score: 6, query_idx: 24, reference_idx: 21 });
2166        a.trace().cigar(res.query_idx, res.reference_idx, &mut cigar);
2167        assert_eq!(cigar.to_string(), "2M6I14M3D2M");
2168    }
2169
2170    #[test]
2171    fn test_local_and_free_query_gaps() {
2172        let test_gaps = Gaps { open: -2, extend: -1 };
2173
2174        let mut local = Block::<true, false, true, false>::new(100, 100, 32);
2175        let mut cigar = Cigar::new(100, 100);
2176
2177        let r = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTAAAAAA", 32);
2178        let q = PaddedBytes::from_bytes::<NucMatrix>(b"CCCCCCCCCCAAAAAA", 32);
2179        local.align(&q, &r, &NW1, test_gaps, 32..=32, 0);
2180        let res = local.res();
2181        assert_eq!(res, AlignResult { score: 6, query_idx: 16, reference_idx: 10 });
2182        local.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2183        assert_eq!(cigar.to_string(), "6=");
2184
2185        let mut local = Block::<true, true, true, false>::new(100, 100, 32);
2186
2187        let r = PaddedBytes::from_bytes::<NucMatrix>(b"TTTTAAAAAATTTTTTT", 32);
2188        let q = PaddedBytes::from_bytes::<NucMatrix>(b"CCCCCCCCCCAAAAAACCCCCCCCCCCC", 32);
2189        local.align(&q, &r, &NW1, test_gaps, 32..=32, 100);
2190        let res = local.res();
2191        assert_eq!(res, AlignResult { score: 6, query_idx: 16, reference_idx: 10 });
2192        local.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2193        assert_eq!(cigar.to_string(), "6=");
2194
2195        let mut q_start = Block::<true, false, false, true>::new(100, 100, 32);
2196
2197        let r = PaddedBytes::from_bytes::<NucMatrix>(b"CCCCCCCCCCAAAAAA", 32);
2198        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAA", 32);
2199        q_start.align(&q, &r, &NW1, test_gaps, 32..=32, 0);
2200        let res = q_start.res();
2201        assert_eq!(res, AlignResult { score: 6, query_idx: 6, reference_idx: 16 });
2202        q_start.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2203        assert_eq!(cigar.to_string(), "6=");
2204
2205        let r = PaddedBytes::from_bytes::<NucMatrix>(b"CCCCCCCCCCAAATAA", 32);
2206        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAA", 32);
2207        q_start.align(&q, &r, &NW1, test_gaps, 32..=32, 0);
2208        let res = q_start.res();
2209        assert_eq!(res, AlignResult { score: 4, query_idx: 6, reference_idx: 16 });
2210        q_start.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2211        assert_eq!(cigar.to_string(), "3=1X2=");
2212
2213        let mut q_end = Block::<true, false, false, false, true>::new(100, 100, 32);
2214
2215        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAACCCCCCCCCC", 32);
2216        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAA", 32);
2217        q_end.align(&q, &r, &NW1, test_gaps, 32..=32, 0);
2218        let res = q_end.res();
2219        assert_eq!(res, AlignResult { score: 6, query_idx: 6, reference_idx: 6 });
2220        q_end.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2221        assert_eq!(cigar.to_string(), "6=");
2222
2223        let r = PaddedBytes::from_bytes::<NucMatrix>(b"AAATAACCCCCCCCCC", 32);
2224        let q = PaddedBytes::from_bytes::<NucMatrix>(b"AAAAAA", 32);
2225        q_end.align(&q, &r, &NW1, test_gaps, 32..=32, 0);
2226        let res = q_end.res();
2227        assert_eq!(res, AlignResult { score: 4, query_idx: 6, reference_idx: 6 });
2228        q_end.trace().cigar_eq(&q, &r, res.query_idx, res.reference_idx, &mut cigar);
2229        assert_eq!(cigar.to_string(), "3=1X2=");
2230    }
2231}