#ifndef BAND_STRIPED_DNA_SEQ_ALIGNMENT_RJ_H
#define BAND_STRIPED_DNA_SEQ_ALIGNMENT_RJ_H
#include "list.h"
#include "sort.h"
#ifdef __AVX2__
#include <immintrin.h>
#else
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#endif
#define SEQALIGN_MODE_GLOBAL 0
#define SEQALIGN_MODE_OVERLAP 1
#define SEQALIGN_MODE_EXTEND 2
#define SEQALIGN_MODE_KMER 3
#define SEQALIGN_MODEMASK_TYPE 0x3
#define SEQALIGN_MODE_QPROF 4
#define SEQALIGN_MODE_MEMRESV 8
#define SEQALIGN_MODE_CIGRESV 16
#define seqalign_mode_type(mode) ((mode) & SEQALIGN_MODEMASK_TYPE)
#define SEQALIGN_BT_M 0
#define SEQALIGN_BT_I 1
#define SEQALIGN_BT_D 2
#define SEQALIGN_BT1_IE 4
#define SEQALIGN_BT1_DE 8
#define SEQALIGN_BT2_I1 1
#define SEQALIGN_BT2_D1 2
#define SEQALIGN_BT2_I2 3
#define SEQALIGN_BT2_D2 4
#define SEQALIGN_BT2_IE1 8
#define SEQALIGN_BT2_DE1 16
#define SEQALIGN_BT2_IE2 32
#define SEQALIGN_BT2_DE2 64
#define SEQALIGN_SCORE_EPI8_MIN (-(MAX_B1 >> 1))
#define SEQALIGN_SCORE_EPI8_MAX (MAX_B1 >> 1)
#define SEQALIGN_SCORE_MIN (-(MAX_B4 >> 2))
#define SEQALIGN_SCORE_MAX (MAX_B4 >> 2)
#define SEQALIGN_CIGAR_M 0
#define SEQALIGN_CIGAR_I 1
#define SEQALIGN_CIGAR_D 2
#define SEQALIGN_CIGAR_N 3
#define SEQALIGN_CIGAR_S 4
#define SEQALIGN_CIGAR_H 5
#define SEQALIGN_CIGAR_P 6
#define SEQALIGN_CIGAR_E 7
#define SEQALIGN_CIGAR_X 8
#ifdef __AVX2__
#define WORDSIZE 32
#define WORDSHIFT 5
typedef __m256i xint;
#define mm_load _mm256_load_si256
#define mm_loadu _mm256_loadu_si256
#define mm_store _mm256_store_si256
#define mm_storeu _mm256_storeu_si256
#define mm_or _mm256_or_si256
#define mm_xor _mm256_xor_si256
#define mm_and _mm256_and_si256
#define mm_andnot _mm256_andnot_si256
#define mm_setzero _mm256_setzero_si256
#define mm_set1_epi8 _mm256_set1_epi8
#define mm_set1_epi16 _mm256_set1_epi16
#define mm_set1_epi32 _mm256_set1_epi32
#define mm_set1_epi64x _mm256_set1_epi64x
#define mm_srli _mm256_srli_si256
#define mm_slli _mm256_slli_si256
#define mm_srli_epi32 _mm256_srli_epi32
#define mm_srai_epi32 _mm256_srai_epi32
#define mm_slli_epi32 _mm256_slli_epi32
#define mm_slai_epi32 _mm256_slai_epi32
#define mm_srli_epi64 _mm256_srli_epi64
#define mm_srai_epi64 _mm256_srai_epi64
#define mm_slli_epi64 _mm256_slli_epi64
#define mm_slai_epi64 _mm256_slai_epi64
#define mm_insert_epi8 _mm256_insert_epi8
#define mm_extract_epi16 _mm256_extract_epi16
#define mm_extract_epi32 _mm256_extract_epi32
#define mm_adds_epi8 _mm256_adds_epi8
#define mm_adds_epu8 _mm256_adds_epu8
#define mm_adds_epi16 _mm256_adds_epi16
#define mm_add_epi32 _mm256_add_epi32
#define mm_subs_epi8 _mm256_subs_epi8
#define mm_subs_epu8 _mm256_subs_epu8
#define mm_subs_epi16 _mm256_subs_epi16
#define mm_sub_epi32 _mm256_sub_epi32
#define mm_cmpeq_epi8 _mm256_cmpeq_epi8
#define mm_cmpgt_epi8 _mm256_cmpgt_epi8
#define mm_cmpeq_epi16 _mm256_cmpeq_epi16
#define mm_cmpgt_epi16 _mm256_cmpgt_epi16
#define mm_cmpeq_epi32 _mm256_cmpeq_epi32
#define mm_cmpgt_epi32 _mm256_cmpgt_epi32
#define mm_movemask_epi8 _mm256_movemask_epi8
#define mm_max_epi8 _mm256_max_epi8
#define mm_min_epi8 _mm256_min_epi8
#define mm_max_epu8 _mm256_max_epu8
#define mm_min_epu8 _mm256_min_epu8
#define mm_max_epi16 _mm256_max_epi16
#define mm_min_epi16 _mm256_min_epi16
#define mm_max_epi32 _mm256_max_epi32
#define mm_min_epi32 _mm256_min_epi32
#define mm_shuffle _mm256_shuffle_epi8
#define mm_blendv _mm256_blendv_epi8
#define mm_cvtepi8x0_epi16(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a))
#define mm_cvtepi8x1_epi16(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi16x0_epi32(a) _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a))
#define mm_cvtepi16x1_epi32(a) _mm256_cvtepi16_epi32(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi8x0_epi32(a) _mm256_cvtepi8_epi32(_mm256_castsi256_si128(a))
#define mm_cvtepi8x1_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 8)))
#define mm_cvtepi8x2_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi8x3_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 24)))
#define mm_packs_epi32 _mm256_packs_epi32
#define mm_packs_epi16 _mm256_packs_epi16
#else
#define WORDSIZE 16
#define WORDSHIFT 4
typedef __m128i xint;
#define mm_load _mm_load_si128
#define mm_loadu _mm_loadu_si128
#define mm_store _mm_store_si128
#define mm_storeu _mm_storeu_si128
#define mm_or _mm_or_si128
#define mm_xor _mm_xor_si128
#define mm_and _mm_and_si128
#define mm_andnot _mm_andnot_si128
#define mm_setzero _mm_setzero_si128
#define mm_set1_epi8 _mm_set1_epi8
#define mm_set1_epi16 _mm_set1_epi16
#define mm_set1_epi32 _mm_set1_epi32
#define mm_set1_epi64x _mm_set1_epi64x
#define mm_srli _mm_srli_si128
#define mm_slli _mm_slli_si128
#define mm_srli_epi32 _mm_srli_epi32
#define mm_srai_epi32 _mm_srai_epi32
#define mm_slli_epi32 _mm_slli_epi32
#define mm_slai_epi32 _mm_slai_epi32
#define mm_srli_epi64 _mm_srli_epi64
#define mm_srai_epi64 _mm_srai_epi64
#define mm_slli_epi64 _mm_slli_epi64
#define mm_slai_epi64 _mm_slai_epi64
#define mm_insert_epi8 _mm_insert_epi8
#define mm_extract_epi16 _mm_extract_epi16
#define mm_extract_epi32 _mm_extract_epi32
#define mm_adds_epi8 _mm_adds_epi8
#define mm_adds_epu8 _mm_adds_epu8
#define mm_adds_epi16 _mm_adds_epi16
#define mm_add_epi32 _mm_add_epi32
#define mm_subs_epi8 _mm_subs_epi8
#define mm_subs_epu8 _mm_subs_epu8
#define mm_subs_epi16 _mm_subs_epi16
#define mm_sub_epi32 _mm_sub_epi32
#define mm_cmpeq_epi8 _mm_cmpeq_epi8
#define mm_cmpgt_epi8 _mm_cmpgt_epi8
#define mm_cmpeq_epi16 _mm_cmpeq_epi16
#define mm_cmpgt_epi16 _mm_cmpgt_epi16
#define mm_cmpeq_epi32 _mm_cmpeq_epi32
#define mm_cmpgt_epi32 _mm_cmpgt_epi32
#define mm_movemask_epi8 _mm_movemask_epi8
#define mm_max_epi8 _mm_max_epi8
#define mm_min_epi8 _mm_min_epi8
#define mm_max_epu8 _mm_max_epu8
#define mm_min_epu8 _mm_min_epu8
#define mm_max_epi16 _mm_max_epi16
#define mm_min_epi16 _mm_min_epi16
#define mm_max_epi32 _mm_max_epi32
#define mm_min_epi32 _mm_min_epi32
#define mm_shuffle _mm_shuffle_epi8
#define mm_blendv _mm_blendv_epi8
#define mm_cvtepi8x0_epi16(a) _mm_cvtepi8_epi16(a)
#define mm_cvtepi8x1_epi16(a) _mm_cvtepi8_epi16(_mm_srli_si128(a, 8))
#define mm_cvtepi16x0_epi32(a) _mm_cvtepi16_epi32(a)
#define mm_cvtepi16x1_epi32(a) _mm_cvtepi16_epi32(_mm_srli_si128(a, 8))
#define mm_cvtepi8x0_epi32(a) _mm_cvtepi8_epi32(a)
#define mm_cvtepi8x1_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 4))
#define mm_cvtepi8x2_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 8))
#define mm_cvtepi8x3_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 12))
#define mm_packs_epi32 _mm_packs_epi32
#define mm_packs_epi16 _mm_packs_epi16
#endif
#define mm_sr1bit(x) mm_or(mm_slli_epi64(mm_srli(x, 8), 63), mm_srli_epi64(x, 1))
#define mm_sl1bit(x) mm_or(mm_srli_epi64(mm_slli(x, 8), 63), mm_slli_epi64(x, 1))
typedef struct {
int score;
int qb, qe;
int tb, te;
int mat, mis, ins, del, aln;
} seqalign_result_t;
#define striped_seqedit_getval(xs, W, pos) (((xs)[(pos) % (W)] >> ((pos) / (W))) & 0x1)
#define striped_seqedit_qprof_size(qlen, bandwidth) ((num_max(qlen, bandwidth) + 1) * 4 * 8)
static inline void striped_seqedit_set_query_prof(u1i *qseq, u4i qlen, u4i bandwidth, u8i *qprof);
static inline void striped_seqedit_row_init(u8i *us[2], u4i W);
static inline void striped_seqedit_row_movx(u8i *us[2][2], u4i W, u4i movx, int mode, int *heading_score);
static inline void striped_seqedit_row_cal(u4i rbeg, u8i *us[2][2], u8i *hs, u8i *qprof, int mode, u4i W, u1i base);
static inline seqalign_result_t striped_seqedit_backtrace(u8i *uts[2], u4i *begs, u4i W, u1i *qseq, int qend, u1i *tseq, int tend, int mode, u4v *cigars);
static inline seqalign_result_t striped_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, int mode, u4i bandwidth, b1v *mempool, u4v *cigars, int verbose);
static inline seqalign_result_t kmer_striped_seqedit_pairwise(u1i ksz, u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int verbose);
#define striped_epi2_seqedit_getval(xs, W, pos) (((xs)[((((pos) % (W)) * WORDSIZE) + (((pos) / (W)) >> 3))] >> (((((pos) / ((W)))) & 0x7))) & 0x1)
#define striped_epi2_seqedit_qprof_size(qlen) (roundup_times(qlen, WORDSIZE * 8) / 2)
static inline void striped_epi2_seqedit_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof);
static inline void striped_epi2_seqedit_row_init(b1i *us[2], u4i W, int mode);
static inline void striped_epi2_seqedit_row_cal(u4i rbeg, b1i *us[2][2], b1i *hs, b1i *qprof, u4i W, u1i base, int mode);
static inline seqalign_result_t striped_epi2_seqedit_backtrace(b1i *uts[2], u4i W, int mode, u1i *qseq, int qend, u1i *tseq, int tend, u4v *cigars);
static inline seqalign_result_t striped_epi2_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, int verbose);
#define banded_striped_epi8_pos2idx(bandwidth, pos) ((((pos) % (bandwidth >> WORDSHIFT)) << WORDSHIFT) + ((pos) / ((bandwidth) >> WORDSHIFT)))
static inline void banded_striped_epi8_seqalign_set_score_matrix(b1i matrix[16], b1i mat, b1i mis){ u4i i; for(i=0;i<16;i++) matrix[i] = ((i ^ (i >> 2)) & 0x3)? mis : mat; }
#define banded_striped_epi8_seqalign_qprof_size(qlen, bandwidth) (((num_max(qlen, bandwidth) + 1) * 4) * WORDSIZE)
#define banded_striped_epi8_seqalign_get_qprof_value(qprof, pos, base) (qprof)[((pos) * 4 + (base)) * WORDSIZE]
static inline void banded_striped_epi8_seqalign_set_query_prof_native(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]);
static inline void banded_striped_epi8_seqalign_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]);
static inline void banded_striped_epi8_seqalign_set_query_prof_hpc(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16], b1i bonus);
static void banded_striped_epi8_seqalign_piecex_row_init(b1i *us, b1i *es, b1i *qs, int *ubegs, int *rbeg, int mode, u4i bandwidth, b1i max_nt, b1i min_nt, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
static inline void banded_striped_epi8_seqalign_piecex_row_movx(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i movx, int piecewise, b1i nt_max, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
static inline void banded_striped_epi8_seqalign_piecex_row_mov(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i mov, int piecewise, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
static inline int banded_striped_epi8_seqalign_piece0_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece1_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece2_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece0_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece1_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece2_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piecex_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh, int piecewise);
static inline void banded_striped_epi8_seqalign_piecex_row_merge(b1i *us[3], b1i *es[3], b1i *qs[3], int *ubegs[3], u4i W, int piecewise);
static inline int banded_striped_epi8_seqalign_piecex_row_verify(int rowidx, int rowoff, int mode, int W, int mov, u1i tbase, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
static inline u4i banded_striped_epi8_seqalign_row_max(b1i *us, int *ubegs, u4i W, int *max_score);
static inline int banded_striped_epi8_seqalign_band_mov(b1i *us, int *ubegs, u4i W, u4i tidx, u4i qoff, u4i qlen);
static inline u4i banded_striped_epi8_seqalign_band_comb(u4i cnt, u4i *qoffs, int **ubegs, b1v *mempool, u4i W, u4i qlen);
static inline int banded_striped_epi8_seqalign_getscore(b1i *us, int *ubegs, u4i W, u8i pos);
static inline u4i banded_striped_epi8_seqalign_piecex_backtrace(u1i *qseq, u1i *tseq, b1i *bs, int *begs, int mode, u4i bandwidth, int piecewise, seqalign_result_t *rs, u4v *cigars);
static inline u4i banded_striped_epi8_seqalign_piecex_backcal(u1i *qseq, u1i *tseq, b1i *ups, b1i *eps, b1i *qps, b1i *ubs, int *roffs, int mode, u4i bandwidth, b1i *matrix, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, seqalign_result_t *rs, u4v *cigars);
static inline u4i seqalign_cigar2alnstr(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars, char *alnstr[3], u4i length);
static inline void seqalign_cigar2alnstr_print(char *qtag, u1i *qseq, u4i qlen, char *ttag, u1i *tseq, u4i tlen, seqalign_result_t *rs, u4v *cigars, int linewidth, FILE *out);
static inline seqalign_result_t banded_striped_epi8_seqalign_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, u4i bandwidth, b1i matrix[16], b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, int verbose);
static inline void _push_cigar_u4v(u4v *cigars, u1i op, u4i sz){
if(cigars->size && (cigars->buffer[cigars->size - 1] & 0xf) == op){
cigars->buffer[cigars->size - 1] += sz << 4;
} else {
push_u4v(cigars, sz << 4 | op);
}
}
static inline u4i _push_cigar_bsalign(u4v *cigars, u4i cg, u1i op, u4i sz){
if(op == (cg & 0xf)){
cg += sz << 4;
} else {
if(cigars && cg) push_u4v(cigars, cg);
cg = sz << 4 | op;
}
return cg;
}
static inline int seqalign_iter_cigars(u4i *cigars, u4i size, u8i *iter_ptr){
u4i cgoff, opoff, op, sz;
cgoff = iter_ptr[0] >> 32;
opoff = iter_ptr[0] & 0xFFFFFFFFU;
while(cgoff < size){
op = cigars[cgoff] & 0x0f;
sz = cigars[cgoff] >> 4;
if(opoff >= sz){
cgoff ++;
opoff = 0;
continue;
}
opoff ++;
iter_ptr[0] = (((u8i)cgoff) << 32) | opoff;
return op;
}
iter_ptr[0] = (((u8i)cgoff) << 32) | opoff;
return -1;
}
#define pop_aln2cigar(){\
do {\
if(alns[0][p] == 5){\
if(alns[1][p] == 5){\
q = 2;\
break;\
} else {\
q = 0;\
}\
} else if(alns[1][p] == 5){\
q = 1;\
} else {\
break;\
}\
for(i=1;i<z;i++){\
if(alns[q][(p+i)%L] == alns[!q][p]){\
alns[q][p] = alns[!q][p];\
alns[q][(p+i)%L] = 5;\
ret ++;\
break;\
} else if(alns[q][(p+i)%L] != 5){\
break;\
}\
}\
} while(0);\
if(q == 2){\
} if(alns[0][p] == 5){\
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_D, 1);\
} else if(alns[1][p] == 5){\
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_I, 1);\
} else {\
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_M, 1);\
}\
p = (p + 1) % L;\
z --;\
}
static inline u4i seqalign_left_tidy_cigars(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars){
u8i iter;
u4i cg, sz, ret;
u1i alns[2][64], *seqs[2];
int op, x[3][2], z, p, q, i, L;
iter = 0;
L = 64;
seqs[0] = qseq;
seqs[1] = tseq;
x[0][0] = x[1][0] = rs->qb;
x[0][1] = x[1][1] = rs->tb;
x[2][0] = x[2][1] = z = p = 0;
sz = cigars->size;
cg = 0;
ret = 0;
while((op = seqalign_iter_cigars(cigars->buffer, sz, &iter)) != -1){
switch(op){
case 0:
case 7:
case 8:
op = 3;
break;
case 1:
case 4:
op = 1;
break;
case 2:
case 3:
op = 2;
break;
}
if(z == L){
pop_aln2cigar();
}
q = (p + z) % L;
if(z < L) z ++;
for(i=0;i<2;i++){
if(op & (1 << i)){
alns[i][q] = seqs[i][x[0][i]++];
} else {
alns[i][q] = 5;
}
}
}
while(z){
pop_aln2cigar();
}
if(cg){
push_u4v(cigars, cg);
}
remove_array_u4v(cigars, 0, sz);
return ret;
}
static inline u4i seqalign_cigar2alnstr(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars, char *alnstr[3], u4i length){
u4i i, j, x, y, z, op, sz;
if(alnstr == NULL) return 0;
if(length == 0){
length = rs->aln;
alnstr[0] = realloc(alnstr[0], length + 1);
alnstr[1] = realloc(alnstr[1], length + 1);
alnstr[2] = realloc(alnstr[2], length + 1);
}
z = 0;
x = rs->qb;
y = rs->tb;
for(i=0;i<cigars->size;i++){
op = cigars->buffer[i] & 0xf;
sz = cigars->buffer[i] >> 4;
sz = num_min(sz, length - z);
switch(op){
case 0:
case 7:
case 8:
for(j=0;j<sz;j++){
alnstr[2][z] = (qseq[x] == tseq[y])? '|' : '*';
alnstr[0][z] = "ACGTN-"[qseq[x ++]];
alnstr[1][z] = "ACGTN-"[tseq[y ++]];
z ++;
}
break;
case 1:
case 4:
for(j=0;j<sz;j++){
alnstr[2][z] = '-';
alnstr[0][z] = "ACGTN-"[qseq[x ++]];
alnstr[1][z] = '-';
z ++;
}
break;
case 2:
case 3:
for(j=0;j<sz;j++){
alnstr[2][z] = '-';
alnstr[0][z] = '-';
alnstr[1][z] = "ACGTN-"[tseq[y ++]];
z ++;
}
}
if(z == length) break;
}
alnstr[0][z] = 0;
alnstr[1][z] = 0;
alnstr[2][z] = 0;
return z;
}
static inline void seqalign_cigar2alnstr_print(char *qtag, u1i *qseq, u4i qlen, char *ttag, u1i *tseq, u4i tlen, seqalign_result_t *rs, u4v *cigars, int linewidth, FILE *out){
char *alnstr[3];
alnstr[0] = alnstr[1] = alnstr[2] = NULL;
seqalign_cigar2alnstr(qseq, tseq, rs, cigars, alnstr, 0);
fprintf(out, "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%0.3f\t%d\t%d\t%d\t%d\n", qtag, qlen, rs->qb, rs->qe, ttag, tlen, rs->tb, rs->te, rs->score, 1.0 * rs->mat / num_max(rs->aln, 1), rs->mat, rs->mis, rs->ins, rs->del);
if(linewidth <= 0){
fprintf(out, "%s\n%s\n%s\n", alnstr[0], alnstr[2], alnstr[1]);
} else {
int i, b, e, qn, tn;
char tmp;
qn = rs->qb;
tn = rs->tb;
for(b=0;b<rs->aln;b+=linewidth){
e = num_min(b + linewidth, rs->aln);
for(i=b;i<e;i++){
if(alnstr[0][i] != '-') qn ++;
if(alnstr[1][i] != '-') tn ++;
}
tmp = alnstr[0][e]; alnstr[0][e] = 0; fprintf(stdout, "%s\tQ[%d]\n", alnstr[0] + b, qn); alnstr[0][e] = tmp;
tmp = alnstr[2][e]; alnstr[2][e] = 0; fprintf(stdout, "%s\n", alnstr[2] + b); alnstr[2][e] = tmp;
tmp = alnstr[1][e]; alnstr[1][e] = 0; fprintf(stdout, "%s\tT[%d]\n", alnstr[1] + b, tn); alnstr[1][e] = tmp;
}
}
free(alnstr[0]);
free(alnstr[1]);
free(alnstr[2]);
}
static inline void striped_seqedit_set_query_prof(u1i *qseq, u4i qlen, u4i bandwidth, u8i *qprof){
u8i *qp, *pq;
u4i xlen, x, pos, j, W;
W = bandwidth / 64;
xlen = num_max(qlen, bandwidth);
memset(qprof, 0, 4 * (xlen + 1) * 8);
#if 1
for(x=0;x<W;x++){
qp = qprof + 4 * x;
for(j=0;j<64;j++){
pos = x + j * W; if(pos < qlen){
qp[qseq[pos]] |= 1LLU << j;
}
}
}
for(;x<=xlen;x++){
qp = qprof + 4 * x;
pq = qp - 4 * W;
qp[0] = pq[0] >> 1;
qp[1] = pq[1] >> 1;
qp[2] = pq[2] >> 1;
qp[3] = pq[3] >> 1;
pos = x + 63 * W;
if(pos < qlen){
qp[qseq[pos]] |= 1LLU << 63; }
}
#else#endif
}
static inline void striped_seqedit_row_init(u8i *us[2], u4i W){
memset(us[0], 0x00, W * sizeof(u8i));
memset(us[1], 0xFF, W * sizeof(u8i));
}
static inline void striped_seqedit_row_movx(u8i *us[2][2], u4i W, u4i movx, int mode, int *sbeg){
u8i *p1, *p2, *p3, *p4, MASK;
u4i i, mov, div, cyc;
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP){
sbeg[0] = 0;
memcpy(us[0][0], us[1][0], W * sizeof(u8i));
memcpy(us[0][1], us[1][1], W * sizeof(u8i));
return;
}
if(movx == 0){
sbeg[0] ++;
} else {
mov = num_min(movx, W * 64);
for(i=0;i<mov;i++){
sbeg[0] -= striped_seqedit_getval(us[1][0], W, i);
sbeg[0] += striped_seqedit_getval(us[1][1], W, i);
}
sbeg[0] ++;
}
if(movx == 0){
memcpy(us[0][0], us[1][0], W * sizeof(u8i));
memcpy(us[0][1], us[1][1], W * sizeof(u8i));
return;
}
if(movx >= W * 64){
memset(us[0][0], 0x00, W * sizeof(u8i));
memset(us[0][1], 0xFF, W * sizeof(u8i));
return;
}
cyc = movx / W;
mov = movx % W;
div = W - mov;
if(cyc){
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - cyc);
p1 = us[0][0];
p2 = us[1][0] + mov;
p3 = us[0][1];
p4 = us[1][1] + mov;
for(i=0;i<div;i++){
p1[i] = p2[i] >> cyc;
p3[i] = (p4[i] >> cyc) | MASK;
}
} else {
memcpy(us[0][0], us[1][0] + mov, div * sizeof(u8i));
memcpy(us[0][1], us[1][1] + mov, div * sizeof(u8i));
}
p1 = us[0][0] + div;
p2 = us[1][0];
p3 = us[0][1] + div;
p4 = us[1][1];
if(cyc){
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - cyc - 1);
for(i=div;i<W;i++){
p1[i] = p2[i] >> (cyc + 1);
p3[i] = (p4[i] >> (cyc + 1)) | MASK;
}
} else {
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - 1);
for(i=0;i<mov;i++){
p1[i] = p2[i] >> 1;
p3[i] = (p4[i] >> 1) | MASK;
}
}
}
static inline void striped_seqedit_row_cal(u4i rbeg, u8i *us[2][2], u8i *hs, u8i *qprof, int mode, u4i W, u1i base){
u8i s, u1, u2, u3, u4, v1, v2, h, h2;
u4i i, running;
v1 = 0x0000000000000000LLU;
v2 = (seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP)? 0x0000000000000000LLU : 0xFFFFFFFFFFFFFFFFLLU;
for(i=0;__builtin_expect(i<W, 0);i++){
s = qprof[(rbeg + i) * 4 + base];
u1 = us[0][0][i];
u2 = us[0][1][i];
h = ~(s | u1 | v1);
u3 = (~h) & v2;
u4 = v2 ^ (h | v1 | v2);
v1 = (~h) & u2;
v2 = u2 ^ (h | u1 | u2);
hs[i] = h;
us[1][0][i] = u3;
us[1][1][i] = u4;
}
running = 1;
while(running){ v1 <<= 1;
v2 <<= 1;
if(seqalign_mode_type(mode) != SEQALIGN_MODE_OVERLAP){
v2 |= 1;
}
for(i=0;i<W;i++){
s = qprof[(rbeg + i) * 4 + base];
h2 = hs[i];
u1 = us[0][0][i];
u2 = us[0][1][i];
h = ~(s | u1 | v1);
u3 = (~h) & v2;
u4 = v2 ^ (h | v1 | v2);
v1 = (~h) & u2;
v2 = u2 ^ (h | u1 | u2);
hs[i] = h;
us[1][0][i] = u3;
us[1][1][i] = u4;
if(h == h2){
running = 0;
break;
}
}
}
}
static inline int striped_seqedit_rowmin(int _sbeg, u8i *us[2], u4i W, u4i *whence){
xint h, u1, u2, u3, m, c, p, d, hh[4], mm[4], pp[4], MASK1, MASK2, ONES;
u8i ux[2];
u4i i, blk, ib, ie, pmin;
int xs[16], sc, st, sbeg, smin;
b1i _mask1[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
b1i _mask2[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
#ifdef DEBUG_ROWMIN
u4i j;
int abc[1024];
b1i tmp[16];
#endif
sbeg = _sbeg;
smin = sbeg; pmin = 0;
MASK1 = mm_load((xint*)_mask1);
MASK2 = mm_load((xint*)_mask2);
ONES = mm_set1_epi8(1);
for(blk=0;blk<4;blk++){
hh[0] = hh[1] = hh[2] = hh[3] = mm_setzero();
mm[0] = mm[1] = mm[2] = mm[3] = mm_setzero();
pp[0] = pp[1] = pp[2] = pp[3] = mm_setzero();
for(ib=0;ib<W;ib=ie){
ie = num_min(ib + 124, W);
h = mm_setzero();
m = mm_setzero();
p = mm_setzero();
for(i=ib;i<ie;i++){
ux[0] = (us[0][i] >> ((blk) << 4)) & 0xFFFFU;
ux[1] = ux[0] >> 8;
ux[0] = ux[0] & 0xFF;
u3 = mm_set1_epi8(ux[0]);
u1 = mm_and(u3, MASK1);
u3 = mm_set1_epi8(ux[1]);
u3 = mm_and(u3, MASK2);
u1 = mm_or(u1, u3);
u1 = mm_cmpeq_epi8(u1, mm_setzero());
u1 = mm_andnot(u1, ONES);
#ifdef DEBUG_ROWMIN
mm_store((xint*)tmp, u1);
for(j=0;j<16;j++){
if((int)striped_seqedit_getval(us[0], W, i + (blk * 16 + j) * W) != tmp[j]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
ux[0] = (us[1][i] >> ((blk) << 4)) & 0xFFFFU;
ux[1] = ux[0] >> 8;
ux[0] = ux[0] & 0xFF;
u3 = mm_set1_epi8(ux[0]);
u2 = mm_and(u3, MASK1);
u3 = mm_set1_epi8(ux[1]);
u3 = mm_and(u3, MASK2);
u2 = mm_or(u2, u3);
u2 = mm_cmpeq_epi8(u2, mm_setzero());
u2 = mm_andnot(u2, ONES);
#ifdef DEBUG_ROWMIN
mm_store((xint*)tmp, u2);
for(j=0;j<16;j++){
if((int)striped_seqedit_getval(us[1], W, i + (blk * 16 + j) * W) != tmp[j]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
h = mm_subs_epi8(h, u1);
h = mm_adds_epi8(h, u2);
c = mm_cmpgt_epi8(m, h);
d = mm_set1_epi8(i - ib);
p = mm_blendv(p, d, c);
m = mm_min_epi8(m, h);
}
for(i=0;i<4;i++){
d = mm_add_epi32(hh[i], mm_cvtepi8x0_epi32(m));
c = mm_cmpgt_epi32(mm[i], d);
mm[i] = mm_min_epi32(mm[i], d);
pp[i] = mm_blendv(pp[i], mm_add_epi32(mm_cvtepi8x0_epi32(p), mm_set1_epi32(ib)), c);
hh[i] = mm_add_epi32(hh[i], mm_cvtepi8x0_epi32(h));
h = mm_srli(h, 4);
m = mm_srli(m, 4);
p = mm_srli(p, 4);
}
}
for(i=0;i<4;i++){
mm_store(((xint*)xs) + i, hh[i]);
}
for(i=0;i<16;i++){
sc = xs[i];
xs[i] = sbeg;
sbeg += sc;
}
#ifdef DEBUG_ROWMIN
for(i=0;i<16;i++){
abc[16 * blk + i] = xs[i];
fprintf(stderr, "SSE[%d] = %d\n", 16 * blk + i, xs[i]);
}
#endif
for(i=0;i<4;i++){
mm_store(hh + i, ((xint*)xs)[i]);
mm[i] = mm_add_epi32(hh[i], mm[i]);
mm_store(((xint*)xs) + i, mm[i]);
}
sc = xs[0];
st = 0;
for(i=1;i<16;i++){
if(sc > xs[i]){
sc = xs[i];
st = i;
}
}
if(sc >= smin) continue;
smin = sc;
mm_store(((xint*)xs), pp[st / 4]);
pmin = (blk * 16 + st) * W + xs[st % 4];
}
#ifdef DEBUG_ROWMIN
{
int ls = 0;
sc = st = _sbeg;
ib = 0;
for(i=0;i<W*64;i++){
if((i % W) == 0){
fprintf(stderr, "[%d] %d %d\n", i / W, st, st - ls);
if(st != abc[i / W]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
ls = st;
}
st -= striped_seqedit_getval(us[0], W, i);
st += striped_seqedit_getval(us[1], W, i);
if(st < sc){
sc = st;
ib = i;
}
}
if(sc != smin){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(ib != pmin){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
fflush(stdout); fprintf(stderr, " -- smin=%d pmin=%d in %s -- %s:%d --\n", smin, pmin, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
}
#endif
if(whence) whence[0] = pmin;
return smin;
}
static inline seqalign_result_t striped_seqedit_backtrace(u8i *uts[2], u4i *begs, u4i W, u1i *qseq, int x, u1i *tseq, int y, int mode, u4v *cigars){
seqalign_result_t rs;
u4i cg, op, cgz;
int u1, u2, u3, u4;
ZEROS(&rs);
rs.qe = x + 1;
rs.te = y + 1;
cgz = 0;
if(cigars){
if(mode & SEQALIGN_MODE_CIGRESV){
cgz = cigars->size;
} else {
clear_u4v(cigars);
}
}
cg = op = 0;
while(x >= 0 && y >= 0){
if(qseq[x] == tseq[y]){
rs.mat ++;
op = 0;
x --;
y --;
} else {
u3 = striped_seqedit_getval(uts[0] + (y + 1) * W, W, x - begs[y + 1]);
u4 = striped_seqedit_getval(uts[1] + (y + 1) * W, W, x - begs[y + 1]);
if(u3 == 0 && u4 == 1){ rs.ins ++;
op = 1; x --;
} else {
u1 = striped_seqedit_getval(uts[0] + (y + 0) * W, W, x - begs[y + 0]);
u2 = striped_seqedit_getval(uts[1] + (y + 0) * W, W, x - begs[y + 0]);
if(u1 == 1 && u2 == 0){
rs.del ++;
op = 2; y --;
} else {
rs.mis ++;
op = 0;
x --;
y --;
}
}
}
if(op == (cg & 0xf)){
cg += 0x10;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = 0x10 | op;
}
}
rs.qb = x + 1;
rs.tb = y + 1;
if(rs.qb){
op = 1;
if(op == (cg & 0xf)){
cg += 0x10 * rs.qb;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = (0x10 * rs.qb) | op;
}
rs.ins += rs.qb;
rs.qb = 0;
}
if((seqalign_mode_type(mode) == SEQALIGN_MODE_GLOBAL || seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND) && rs.tb){
op = 2;
if(op == (cg & 0xf)){
cg += 0x10 * rs.tb;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = (0x10 * rs.tb) | op;
}
rs.del += rs.tb;
rs.tb = 0;
}
rs.aln = rs.mat + rs.mis + rs.ins + rs.del;
if(cg && cigars) push_u4v(cigars, cg);
if(cigars) sub_reverse_u4v(cigars, cgz, cigars->size);
return rs;
}
static inline seqalign_result_t striped_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, int mode, u4i bandwidth, b1v *mempool, u4v *cigars, int verbose){
seqalign_result_t rs;
u8i *memp, *mempb, mpsize, *qprof, *uts[2], *us[3][2], *hs;
u4i i, k, W, movx, *begs, rbeg[2];
int rx, ry, smin, srow, sbeg;
if(qlen == 0 || tlen == 0){
memset(&rs, 0, sizeof(seqalign_result_t));
return rs;
}
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){ bandwidth = roundup_times(qlen, 64);
} else {
bandwidth = roundup_times(bandwidth, 64);
if(bandwidth == 0 || bandwidth > qlen){
bandwidth = roundup_times(qlen, 64);
}
if(bandwidth < qlen){
if(bandwidth < ((qlen + tlen - 1) / tlen) + 1){ bandwidth = roundup_times((qlen + tlen - 1) / tlen + 1, 64);
}
}
}
W = bandwidth / 64;
mpsize = 8;
mpsize += striped_seqedit_qprof_size(qlen, bandwidth); mpsize += 2 * W * 8 * ((tlen + 1)); mpsize += 2 * W * 8; mpsize += W * 8; mpsize += (tlen + 1) * 4;
if(mempool){
if(mempool->aligned < WORDSIZE){
fflush(stdout); fprintf(stderr, " -- mempool should be aligned by (%d) but (%d) bytes in %s -- %s:%d --\n", WORDSIZE, mempool->aligned, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
encap_b1v(mempool, mpsize);
memp = (u8i*)(mempool->buffer + mempool->size + 8);
mempb = NULL;
} else {
mempb = (u8i*)malloc(mpsize);
memp = mempb + 1;
}
qprof = memp; memp += striped_seqedit_qprof_size(qlen, bandwidth) / 8;
uts[0] = memp; memp += W * (tlen + 1);
uts[1] = memp; memp += W * (tlen + 1);
us[2][0] = memp; memp += W;
us[2][1] = memp; memp += W;
hs = memp; memp += W;
begs = (u4i*)memp;
if(mode & SEQALIGN_MODE_QPROF){
} else {
striped_seqedit_set_query_prof(qseq, qlen, bandwidth, qprof);
}
us[1][0] = uts[0];
us[1][1] = uts[1];
rx = qlen - 1;
ry = tlen - 1;
smin = MAX_B4;
sbeg = 0;
striped_seqedit_row_init(us[1], W);
rbeg[0] = rbeg[1] = 0;
begs[0] = 0;
for(i=0;i<tlen;i++){
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){
rbeg[1] = 0;
} else {
rbeg[1] = ((u8i)i * qlen) / tlen;
rbeg[1] = (rbeg[1] < bandwidth / 2)? 0 : rbeg[1] - bandwidth / 2;
rbeg[1] = (rbeg[1] + bandwidth > roundup_times(qlen, 64))? roundup_times(qlen, 64) - bandwidth : rbeg[1];
}
begs[i + 1] = rbeg[1];
movx = rbeg[1] - rbeg[0];
us[0][0] = us[2][0];
us[0][1] = us[2][1];
striped_seqedit_row_movx(us, W, movx, mode, &sbeg);
us[1][0] = uts[0] + (i + 1) * W;
us[1][1] = uts[1] + (i + 1) * W;
striped_seqedit_row_cal(rbeg[1], us, hs, qprof, mode, W, tseq[i]);
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){
srow = sbeg;
for(k=0;k<W;k++){
srow -= __builtin_popcountll(us[1][0][k]);
srow += __builtin_popcountll(us[1][1][k]);
}
for(k=rbeg[1]+bandwidth;k>qlen;k--){
srow += striped_seqedit_getval(us[1][0], W, k - 1 - rbeg[1]);
srow -= striped_seqedit_getval(us[1][1], W, k - 1 - rbeg[1]);
}
if(srow < smin){
smin = srow;
rx = qlen - 1;
ry = i;
}
}
if(verbose){
int vals[2][2] = {{0, 1}, {-1, 2}};
int j, b1, b2, b3, b4, u, u2, v, v2, score, error;
score = sbeg;
v2 = seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP? 0 : 1;
error = 0;
if(verbose > 1){
fprintf(stdout, "[%04d:%c] rbeg=%d\tmov=%d\t", i, "ACGTN"[tseq[i]], rbeg[1], movx);
}
for(j=0;j<(int)num_min(qlen-rbeg[1], bandwidth);j++){
b1 = striped_seqedit_getval(us[0][0], W, j);
b2 = striped_seqedit_getval(us[0][1], W, j);
u = vals[b1][b2];
v = v2;
if(qseq[rbeg[1] + j] == tseq[i] || u == -1 || v == -1){
u2 = 0 - v;
v2 = 0 - u;
} else {
u2 = 1 - v;
v2 = 1 - u;
}
b3 = striped_seqedit_getval(us[1][0], W, j);
b4 = striped_seqedit_getval(us[1][1], W, j);
if(error == 0 && u2 != vals[b3][b4]){
fprintf(stdout, "\n"); fflush(stdout);
fprintf(stderr, "\n -- i=%d j=%d s=%c%c u=%d v=%d u2=%d should be %d in %s -- %s:%d --\n", i, j, "ACGTN"[qseq[rbeg[1] + j]], "ACGTN"[tseq[i]], u, v, vals[b3][b4], u2, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
error = 1;
}
if(b3 == 0 && b4 == 1) score ++;
else if(b3 == 1 && b4 == 0) score --;
if(verbose > 1){
fprintf(stdout, "%c%03d:%c:%c ", "ACGTN"[qseq[rbeg[1] + j]], score, "-*+"[vals[b3][b4] + 1], "-*+"[v2 + 1]);
}
}
if(verbose > 1){
fprintf(stdout, "\n");
}
}
rbeg[0] = rbeg[1];
}
if(seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){
srow = striped_seqedit_rowmin(sbeg, us[1], W, &k);
if(srow < smin){
smin = srow;
rx = k;
ry = tlen - 1;
}
}
rs = striped_seqedit_backtrace(uts, begs, W, qseq, rx, tseq, ry, mode, cigars);
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP){
rs.score = smin + rs.te - rs.tb;
} else if(seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){
rs.score = smin;
} else {
rs.score = sbeg;
for(i=0;i<W;i++){
rs.score -= __builtin_popcountll(us[1][0][i]);
rs.score += __builtin_popcountll(us[1][1][i]);
}
for(i=rbeg[1]+bandwidth;i>qlen;i--){
rs.score += striped_seqedit_getval(us[1][0], W, i - 1 - rbeg[1]);
rs.score -= striped_seqedit_getval(us[1][1], W, i - 1 - rbeg[1]);
}
}
if(mempb) free(mempb);
return rs;
}
static inline seqalign_result_t kmer_striped_seqedit_pairwise(u1i ksz, u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int verbose){
seqalign_result_t RS, RS2;
u8i *maps;
u4i i, kmap, cmin, mode, ml, qb, qe, tb, te, khf;
kmap = 0;
maps = NULL;
if(ksz > 15) ksz = 15;
clear_b1v(mempool);
do {
cmin = num_min(qlen, tlen) * 0.05 + 1;
cmin = num_min(cmin, 2 * ksz);
typedef struct { u4i kflg:1, kmer:30, kdir:1, koff; } _tmp_kmer_t;
typedef struct { _tmp_kmer_t kmers[2]; } _tmp_khit_t;
_tmp_kmer_t *kmers, *k;
_tmp_khit_t *khits, *h;
u4i flg, b, e, m, xlen, dir, kvals[2], kmk, sft, kcnt, *lisx[2];
int tot, mean, median, delta, var, *xoffs;
u1i *xseq;
kmk = MAX_U4 >> ((16 - ksz) << 1);
sft = (ksz - 1) << 1;
encap_b1v(mempool, (qlen + tlen) * sizeof(_tmp_kmer_t));
kmers = (_tmp_kmer_t*)(mempool->buffer + mempool->size);
kcnt = 0;
for(flg=0;flg<2;flg++){
xseq = flg? tseq : qseq;
xlen = flg? tlen : qlen;
kvals[0] = kvals[1] = 0;
for(i=0;i+1<ksz;i++){
b = xseq[i];
kvals[0] = (kvals[0] << 2) | b;
kvals[1] = (kvals[1] >> 2) | (((~b) & 0x03) << sft);
}
for(;i<xlen;i++){
b = xseq[i];
kvals[0] = ((kvals[0] << 2) | b) & kmk;
kvals[1] = (kvals[1] >> 2) | (((~b) & 0x03) << sft);
dir = kvals[1] < kvals[0];
k = kmers + (kcnt ++);
k->kflg = flg;
k->kmer = kvals[dir];
k->kdir = dir;
k->koff = i + 1 - ksz;
}
}
sort_array(kmers, kcnt, _tmp_kmer_t, num_cmpgt(a.kmer, b.kmer));
ZEROS(kmers + kcnt);
xlen = 0;
for(b=i=0;i<=kcnt;i++){
if(kmers[i].kmer == kmers[b].kmer){
continue;
}
if(i - b != 2 || kmers[b].kflg == kmers[b+1].kflg || kmers[b].kdir != kmers[b+1].kdir){
b = i;
continue;
} else if(kmers[b].kflg > kmers[b + 1].kflg){
swap_var(kmers[b], kmers[b+1]);
}
if(xlen < b){
kmers[xlen++] = kmers[b++];
kmers[xlen++] = kmers[b++];
} else {
xlen += 2;
}
b = i;
}
kcnt = xlen / 2;
khits = (_tmp_khit_t*)kmers;
if(kcnt * ksz < cmin) break;
encap_b1v(mempool, kcnt * sizeof(_tmp_khit_t) + 2 * kcnt * sizeof(u4i));
khits = (_tmp_khit_t*)(mempool->buffer + mempool->size);
lisx[0] = (u4i*)(khits + kcnt);
lisx[1] = lisx[0] + kcnt;
sort_array(khits, kcnt, _tmp_khit_t, num_cmpgt(a.kmers[0].koff, b.kmers[0].koff));
lisx[0][0] = 0;
lisx[1][0] = MAX_U4;
xlen = 1;
for(i=1;i<kcnt;i++){
h = khits + i;
e = xlen - 1;
if(h->kmers[1].koff > khits[lisx[0][e]].kmers[1].koff){
lisx[1][i] = lisx[0][e];
lisx[0][xlen++] = i;
} else if(h->kmers[1].koff <= khits[lisx[0][0]].kmers[1].koff){
lisx[1][i] = MAX_U4;
lisx[0][0] = i;
} else {
b = 0; e = xlen;
while(b < e){
m = b + ((e - b) >> 1);
if(h->kmers[1].koff > khits[lisx[0][m]].kmers[1].koff){
b = m + 1;
} else if(h->kmers[1].koff < khits[lisx[0][m]].kmers[1].koff){
e = m;
} else {
b = m;
break;
}
}
lisx[1][i] = lisx[1][lisx[0][b - 1]];
lisx[0][b] = i;
}
}
b = 0; e = MAX_U4;
m = lisx[0][xlen-1];
while(m != MAX_U4){
h = khits + m;
h->kmers[0].kflg = 1;
if(h->kmers[1].koff + ksz <= e){
b += ksz;
} else {
b += e - h->kmers[1].koff;
}
e = h->kmers[1].koff;
m = lisx[1][m];
}
if(b < cmin){
break;
}
#ifdef LOCAL_DEBUG
for(i=1;i<kcnt;i++){
if(khits[i].kmers[0].koff >= qlen){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(khits[i].kmers[0].koff <= khits[i-1].kmers[0].koff){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
xoffs = (int*)lisx[0];
while(1){
tot = e = 0;
for(i=0;i<kcnt;i++){
if(khits[i].kmers[0].kflg == 0) continue;
delta = Int(khits[i].kmers[0].koff) - Int(khits[i].kmers[1].koff);
tot += delta;
xoffs[e++] = delta;
}
if(e * ksz < cmin) break;
mean = tot / Int(e);
median = quick_median_array(xoffs, e, int, num_cmpgt(a, b));
#ifdef LOCAL_DEBUG
void my_print_khits(){
u4i i, j, off;
off = 0;
for(i=j=0;i<kcnt;i++){
if(khits[i].kmers[0].kflg == 0) continue;
fprintf(stderr, "KHITS[%d]\t%d\t%d\t%d\t%d\n", j, khits[i].kmers[0].koff, khits[i].kmers[1].koff, khits[i].kmers[0].koff - off, Int(khits[i].kmers[0].koff) - Int(khits[i].kmers[1].koff));
off = khits[i].kmers[0].koff;
j ++;
}
fprintf(stderr, "--- END KHITS ---\n");
}
my_print_khits();
#endif
var = num_abs(median - mean) * 3;
var = num_max(var, 50);
#ifdef LOCAL_DEBUG
fprintf(stderr, "Filter VAR: %d\n", var);
#endif
for(i=b=0;i<kcnt;i++){
if(khits[i].kmers[0].kflg == 0) continue;
delta = Int(khits[i].kmers[0].koff) - Int(khits[i].kmers[1].koff);
#ifdef LOCAL_DEBUG
fprintf(stderr, "KHITS\t%d\t%d\t%d\t%d\n", khits[i].kmers[0].koff, khits[i].kmers[1].koff, delta, num_abs(delta - mean));
#endif
if(num_abs(delta - mean) > var){
khits[i].kmers[0].kflg = 0;
b ++;
}
}
#ifdef LOCAL_DEBUG
fprintf(stderr, "KHITS STAT: num=%d mean=%d median=%d filter=%d\n", e, mean, median, b);
#endif
if(b == 0) break;
}
for(b=i=0;i<kcnt;i++){
if(khits[i].kmers[0].kflg == 0) continue;
if(b < i){
khits[b] = khits[i];
}
b ++;
}
kcnt = b;
m = 0;
e = 0;
for(i=0;i<kcnt;i++){
h = khits + i;
if(h->kmers[1].koff >= e + ksz){
m += ksz;
} else {
m += h->kmers[1].koff + ksz - e;
}
e = h->kmers[1].koff + ksz;
}
if(m < cmin){
break;
}
kmap = kcnt;
maps = (u8i*)khits;
#ifdef LOCAL_DEBUG
for(i=1;i<kmap;i++){
if(khits[i].kmers[0].koff >= qlen){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(khits[i].kmers[0].koff <= khits[i-1].kmers[0].koff){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
for(i=0;i<kmap;i++){
maps[i] = (((u8i)khits[i].kmers[0].koff) << 32) | (khits[i].kmers[1].koff);
}
} while(0);
clear_u4v(cigars);
if(kmap == 0){
return striped_seqedit_pairwise(qseq, qlen, tseq, tlen, SEQALIGN_MODE_GLOBAL, 0, mempool, cigars, verbose);
}
#ifdef LOCAL_DEBUG
for(i=1;i<kmap;i++){
if((maps[i] >> 32) >= qlen){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if((maps[i] >> 32) <= (maps[i-1] >> 32)){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
encap_b1v(mempool, roundup_times(kmap * sizeof(u8i), WORDSIZE));
mempool->size = roundup_times(kmap * sizeof(u8i), WORDSIZE);
maps = (u8i*)mempool->buffer;
qb = tb = ml = 0;
khf = (ksz - 1) / 2;
ZEROS(&RS);
mode = SEQALIGN_MODE_KMER;
#ifdef LOCAL_DEBUG
u4v *cigs = init_u4v(64);
#endif
for(i=0;i<=kmap;i++){
if(i == kmap){
qe = qlen;
te = tlen;
mode = SEQALIGN_MODE_EXTEND;
} else {
#ifdef LOCAL_DEBUG
fprintf(stderr, "[KHIT] %d\t%d\n", Int(maps[i] >> 32), Int(maps[i]));
#endif
qe = (maps[i] >> 32) + (ksz / 2);
te = maps[i] + (ksz / 2);
ml ++;
}
if(qb == qe && tb == te){
} else {
if(ml){
_push_cigar_u4v(cigars, SEQALIGN_CIGAR_M, ml);
RS.mat += ml;
RS.aln += ml;
#ifdef LOCAL_DEBUG
fprintf(stderr, "[KMAP] %d kmer matching\n", ml);
#endif
ml = 0;
}
#ifdef LOCAL_DEBUG
fprintf(stderr, "[KMAP] mode%d %d-%d:%d\t%d-%d:%d\t%d\n", mode, qb, qe, qe - qb, tb, te, te - tb, Int(qe) - Int(te));
clear_u4v(cigs);
#endif
if(mode == SEQALIGN_MODE_KMER){
reverse_array(qseq, qe, u1i);
reverse_array(tseq, te, u1i);
RS2 = striped_seqedit_pairwise(qseq + qb, qe - qb, tseq + tb, te - tb, SEQALIGN_MODE_EXTEND | SEQALIGN_MODE_CIGRESV, 0, mempool, cigars, verbose);
reverse_array(qseq, qe, u1i);
reverse_array(tseq, te, u1i);
RS.qb = qe - RS2.qe;
RS.tb = te - RS2.te;
RS.qe = qe;
RS.te = te;
reverse_u4v(cigars);
#ifdef LOCAL_DEBUG
seqalign_cigar2alnstr_print("Q", qseq + RS.qb, "T", tseq + RS.tb, &RS2, cigars, stderr);
#endif
} else {
#ifdef LOCAL_DEBUG
RS2 = striped_seqedit_pairwise(qseq + qb, qe - qb, tseq + tb, te - tb, mode | SEQALIGN_MODE_CIGRESV, 0, mempool, cigs, verbose);
seqalign_cigar2alnstr_print("Q", qseq + qb, "T", tseq + tb, &RS2, cigs, stderr);
u4i j;
for(j=0;j<cigs->size;j++){
_push_cigar_u4v(cigars, cigs->buffer[j] & 0xF, cigs->buffer[j] >> 4);
}
#else
RS2 = striped_seqedit_pairwise(qseq + qb, qe - qb, tseq + tb, te - tb, mode | SEQALIGN_MODE_CIGRESV, 0, mempool, cigars, verbose);
#endif
RS.qe = qb + RS2.qe;
RS.te = tb + RS2.te;
}
maps = (u8i*)mempool->buffer; RS.mat += RS2.mat;
RS.mis += RS2.mis;
RS.ins += RS2.ins;
RS.del += RS2.del;
RS.aln += RS2.aln;
RS.score += RS2.score;
#ifdef LOCAL_DEBUG
seqalign_cigar2alnstr_print("Q", qseq, "T", tseq, &RS, cigars, stderr);
#endif
}
qb = qe + 1;
tb = te + 1;
mode = SEQALIGN_MODE_GLOBAL;
}
#ifdef LOCAL_DEBUG
free_u4v(cigs);
#endif
return RS;
}
static inline void striped_epi2_seqedit_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof){
b1i *qp;
u4i xlen, x, y, pos, i, j, k, W;
xlen = roundup_times(qlen, WORDSIZE * 8);
W = xlen / (WORDSIZE * 8);
memset(qprof, 0, 4 * W * WORDSIZE);
qp = qprof;
for(i=0;i<4;i++){
for(j=0;j<W;j++){
for(k=0;k<WORDSIZE;k++){
y = 8 * k * W + j;
for(x=0;x<8;x++){
pos = y + x * W;
if(pos < qlen && qseq[pos] == i){
qp[0] |= 0x1 << ((x & 0x7));
if(0){
if(striped_epi2_seqedit_getval(qprof + i * W * WORDSIZE, W, pos) != 0x1){
fprintf(stderr, " -- something wrong i=%d j=%d k=%d x=%d pos=%d in %s -- %s:%d --\n", i, j, k, x, pos, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
}
}
qp ++;
}
}
}
}
static inline void striped_epi2_seqedit_row_init(b1i *us[2], u4i W, int mode){
xint BIT0, BIT1;
u4i i;
UNUSED(mode);
BIT0 = mm_set1_epi8(0x00);
BIT1 = mm_set1_epi8(0xFF);
for(i=0;i<W;i++){
mm_store(((xint*)us[0]) + i, BIT0);
mm_store(((xint*)us[1]) + i, BIT1);
}
}
static inline void fprint_striped_epi1_word(FILE *out, xint v){
u4i i, j, x;
u1i xs[WORDSIZE];
mm_store((xint*)xs, v);
for(i=0;i<WORDSIZE;i++){
for(j=0;j<8;j++){
x = (xs[i] >> (((j)&0x7))) & 0x1;
fprintf(out, "%d", x);
}
fprintf(out, " ");
}
}
static inline void striped_epi2_seqedit_row_cal(u4i rbeg, b1i *us[2][2], b1i *hs, b1i *qprof, u4i W, u1i base, int mode){
xint s, u1, u2, u3, u4, v1, v2, h, h2, BIT0, BIT1, BMSK;
u4i i, running;
UNUSED(rbeg);
UNUSED(mode);
BIT0 = mm_set1_epi8(0x00);
BIT1 = mm_set1_epi8(0xFF);
BMSK = mm_srli(mm_set1_epi8(0x1), WORDSIZE - 1);
v1 = BIT0;
v2 = BIT1;
for(i=0;i<W;i++){
s = mm_load(((xint*)qprof) + base * W + i);
u1 = mm_load(((xint*)us[0][0]) + i);
u2 = mm_load(((xint*)us[0][1]) + i);
h = mm_andnot(mm_or(s, mm_or(u1, v1)), BIT1);
u3 = mm_andnot(h, v2);
u4 = mm_xor(v2, mm_or(h, mm_or(v1, v2)));
if(0){
fprintf(stdout, "i = %d\n", i);
fprintf(stdout, "s = "); fprint_striped_epi1_word(stdout, s); fprintf(stdout, "\n");
fprintf(stdout, "u1 = "); fprint_striped_epi1_word(stdout, u1); fprintf(stdout, "\n");
fprintf(stdout, "u2 = "); fprint_striped_epi1_word(stdout, u2); fprintf(stdout, "\n");
fprintf(stdout, "v1 = "); fprint_striped_epi1_word(stdout, v1); fprintf(stdout, "\n");
fprintf(stdout, "v2 = "); fprint_striped_epi1_word(stdout, v2); fprintf(stdout, "\n");
fprintf(stdout, "h = "); fprint_striped_epi1_word(stdout, h); fprintf(stdout, "\n");
fprintf(stdout, "u3 = "); fprint_striped_epi1_word(stdout, u3); fprintf(stdout, "\n");
fprintf(stdout, "u4 = "); fprint_striped_epi1_word(stdout, u4); fprintf(stdout, "\n");
}
v1 = mm_andnot(h, u2);
v2 = mm_xor(u2, mm_or(h, mm_or(u1, u2)));
mm_store(((xint*)hs) + i, h);
mm_store(((xint*)us[1][0]) + i, u3);
mm_store(((xint*)us[1][1]) + i, u4);
}
running = 1;
while(running){
v1 = mm_sl1bit(v1);
v2 = mm_sl1bit(v2);
v2 = mm_or(v2, BMSK);
for(i=0;i<W;i++){
s = mm_load(((xint*)qprof) + base * W + i);
h2 = mm_load(((xint*)hs) + i);
u1 = mm_load(((xint*)us[0][0]) + i);
u2 = mm_load(((xint*)us[0][1]) + i);
h = mm_andnot(mm_or(s, mm_or(u1, v1)), BIT1);
u3 = mm_andnot(h, v2);
u4 = mm_xor(v2, mm_or(h, mm_or(v1, v2)));
v1 = mm_andnot(h, u2);
v2 = mm_xor(u2, mm_or(h, mm_or(u1, u2)));
mm_store(((xint*)hs) + i, h);
mm_store(((xint*)us[1][0]) + i, u3);
mm_store(((xint*)us[1][1]) + i, u4);
if(!mm_movemask_epi8(mm_andnot(mm_cmpeq_epi8(h2, h), BIT1))){
running = 0;
break;
}
}
}
}
static inline seqalign_result_t striped_epi2_seqedit_backtrace(b1i *uts[2], u4i W, int mode, u1i *qseq, int x, u1i *tseq, int y, u4v *cigars){
seqalign_result_t rs;
u4i cg, op, cgz;
int u1, u2, u3, u4;
UNUSED(mode);
ZEROS(&rs);
rs.qe = x + 1;
rs.te = y + 1;
cgz = 0;
if(cigars){
if(mode & SEQALIGN_MODE_CIGRESV){
cgz = cigars->size;
} else {
clear_u4v(cigars);
}
}
cg = op = 0;
while(x >= 0 && y >= 0){
if(qseq[x] == tseq[y]){
rs.mat ++;
op = 0;
x --;
y --;
} else {
u3 = striped_epi2_seqedit_getval(uts[0] + Int64(y + 1) * W * WORDSIZE, W, x);
u4 = striped_epi2_seqedit_getval(uts[1] + Int64(y + 1) * W * WORDSIZE, W, x);
if(u3 == 0 && u4 == 1){ rs.ins ++;
op = 1; x --;
} else {
u1 = striped_epi2_seqedit_getval(uts[0] + Int64(y + 0) * W * WORDSIZE, W, x);
u2 = striped_epi2_seqedit_getval(uts[1] + Int64(y + 0) * W * WORDSIZE, W, x);
if(u1 == 1 && u2 == 0){
rs.del ++;
op = 2; y --;
} else {
rs.mis ++;
op = 0;
x --;
y --;
}
}
}
if(op == (cg & 0xf)){
cg += 0x10;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = 0x10 | op;
}
}
rs.qb = x + 1;
rs.tb = y + 1;
if(rs.qb){
op = 1;
if(op == (cg & 0xf)){
cg += 0x10 * rs.qb;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = (0x10 * rs.qb) | op;
}
rs.ins += rs.qb;
rs.qb = 0;
}
if(rs.tb){
op = 2;
if(op == (cg & 0xf)){
cg += 0x10 * rs.tb;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = (0x10 * rs.tb) | op;
}
rs.del += rs.tb;
rs.tb = 0;
}
rs.aln = rs.mat + rs.mis + rs.ins + rs.del;
if(cg && cigars) push_u4v(cigars, cg);
if(cigars) sub_reverse_u4v(cigars, cgz, cigars->size);
return rs;
}
#define striped_epi2_seqedit_row_merge_core(shift)do{ \
xint MASK, SHFT;\
MASK = mm_set1_epi8(1);\
SHFT = mm_set1_epi8((unsigned short)(1 << shift));\
for(i=0;i<W;i++){\
q1 = mm_load(((xint*)us[0][0]) + i);\
p1 = mm_load(((xint*)us[0][1]) + i);\
h1 = mm_adds_epi8(h1, mm_and(mm_srli_epi64(p1, shift), MASK));\
h1 = mm_subs_epi8(h1, mm_and(mm_srli_epi64(q1, shift), MASK));\
q2 = mm_load(((xint*)us[1][0]) + i);\
p2 = mm_load(((xint*)us[1][1]) + i);\
h2 = mm_adds_epi8(h2, mm_and(mm_srli_epi64(p2, shift), MASK));\
h2 = mm_subs_epi8(h2, mm_and(mm_srli_epi64(q2, shift), MASK));\
h4 = mm_min_epi8(h1, h2);\
u1 = mm_load(((xint*)us[2][0]) + i);\
u2 = mm_load(((xint*)us[2][1]) + i);\
u1 = mm_or(u1, mm_and(mm_cmpgt_epi8(h3, h4), SHFT));\
u2 = mm_or(u2, mm_and(mm_cmpgt_epi8(h4, h3), SHFT));\
h3 = h4;\
mm_store(((xint*)us[2][0]) + i, u1);\
mm_store(((xint*)us[2][1]) + i, u2);\
}\
}while(0)
static inline void striped_seqedit_sum_4bits(u1i *dst, b1i *src, u4i W){
u8i *s, t[WORDSIZE/8], *r, v;
u4i i, k, l, j;
memset(dst, 0, WORDSIZE);
for(k=0;k<W;k+=3){ l = num_min(k + 3, W);
memset(t, 0, WORDSIZE);
for(i=k;i<l;i++){
s = (u8i*)(src + i * WORDSIZE);
for(j=0;j<WORDSIZE/8;j++){ v = s[j] - ((s[j] >> 1) & 0x5555555555555555LLU);
v = (v & 0x3333333333333333LLU) + ((v >> 2) & 0x3333333333333333LLU);
t[j] += v;
}
}
for(j=0;j<WORDSIZE/8;j++){
r = ((u8i*)dst) + j * 2;
v = ((t[j] << 16) & 0x0000FFFF00000000LLU) | (t[j] & 0x000000000000FFFFLLU);
v = ((v << 8) | v) & 0x00FF00FF00FF00FFLLU;
v = ((v << 4) | v) & 0x0F0F0F0F0F0F0F0FLLU;
r[0] += v;
v = ((t[j] >> 16) & 0x0000FFFF00000000LLU) | ((t[j] >> 32) & 0x000000000000FFFFLLU);
v = ((v << 8) | v) & 0x00FF00FF00FF00FFLLU;
v = ((v << 4) | v) & 0x0F0F0F0F0F0F0F0FLLU;
r[1] += v;
}
}
}
static inline void striped_epi2_seqedit_row_merge(u2i sbegs[2], b1i *us[3][2], b1i *hs[3], u1i W){
xint p1, p2, q1, q2, u1, u2, h1, h2, h3, h4;
u4i i;
b2i ts[3];
u1i rs[4][WORDSIZE];
#ifdef CHECK_NO_SSE
#define DEBUG_NO_SSE
#endif
#ifdef CHECK_NO_SSE
int ds[2][128];
memset(ds[0], 0, 128 * sizeof(int));
memset(ds[1], 0, 128 * sizeof(int));
#endif
if(W > 63){
W = 63;
}
striped_seqedit_sum_4bits(rs[0], us[0][0], W);
striped_seqedit_sum_4bits(rs[1], us[0][1], W);
striped_seqedit_sum_4bits(rs[2], us[1][0], W);
striped_seqedit_sum_4bits(rs[3], us[1][1], W);
ts[0] = sbegs[0];
ts[1] = sbegs[1];
for(i=0;i<WORDSIZE*2;i++){
ts[2] = (ts[0] + ts[1]) / 2;
ts[0] -= ts[2];
ts[1] -= ts[2];
if(ts[0] <= ts[1]){
hs[0][i] = ts[0] < -7? -7 : ts[0];
hs[1][i] = ts[1] > 7? 7 : ts[1];
} else {
hs[0][i] = ts[0] > 7? 7 : ts[0];
hs[1][i] = ts[1] < -7? -7 : ts[1];
}
ts[0] += rs[0][i];
ts[1] += rs[1][i];
}
{
h1 = mm_load(((xint*)hs[0]));
h2 = mm_load(((xint*)hs[1]));
h4 = mm_min_epi16(h1, h2);
mm_store(((xint*)hs[2]), h4);
}
#ifdef CHECK_NO_SSE
void sub_print_3hs(int tag){
UNUSED(tag);
#ifdef DEBUG_NO_SSE
fprintf(stdout, "--- HS %d ---\n", tag);
fprintf(stdout, "hs1");
for(j=0;j<WORDSIZE*8;j++){
k = (((j) & 0xf) << (WORDSHIFT - 1)) + ((j) >> 4);
fprintf(stdout, "\t%d", hs[0][k]);
}
fprintf(stdout, "\n");
fprintf(stdout, "hs2");
for(j=0;j<WORDSIZE*8;j++){
k = (((j) & 0xf) << (WORDSHIFT - 1)) + ((j) >> 4);
fprintf(stdout, "\t%d", hs[1][k]);
}
fprintf(stdout, "\n");
fprintf(stdout, "hs3");
for(j=0;j<WORDSIZE*8;j++){
k = (((j) & 0xf) << (WORDSHIFT - 1)) + ((j) >> 4);
fprintf(stdout, "\t%d", hs[2][k]);
}
fprintf(stdout, "\n");
#endif
for(j=0;j<WORDSIZE*8;j++){
k = (((j) & 0xf) << (WORDSHIFT - 1)) + ((j) >> 4);
if(hs[2][k] != num_min(hs[0][k], hs[1][k])){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
}
sub_print_3hs(-1);
#endif
striped_epi2_seqedit_row_merge_core(0);
striped_epi2_seqedit_row_merge_core(1);
striped_epi2_seqedit_row_merge_core(2);
striped_epi2_seqedit_row_merge_core(3);
striped_epi2_seqedit_row_merge_core(4);
striped_epi2_seqedit_row_merge_core(5);
striped_epi2_seqedit_row_merge_core(6);
striped_epi2_seqedit_row_merge_core(7);
#ifdef CHECK_NO_SSE
sub_print_3hs(i);
int score[4], b1, b2;
score[0] = sbegs[0];
score[1] = sbegs[1];
score[2] = num_min(score[0], score[1]);
for(i=0;i<W*WORDSIZE;i++){
b1 = striped_epi2_seqedit_getval(us[0][0], W, i);
b2 = striped_epi2_seqedit_getval(us[0][1], W, i);
score[0] = score[0] + b2 - b1;
b1 = striped_epi2_seqedit_getval(us[1][0], W, i);
b2 = striped_epi2_seqedit_getval(us[1][1], W, i);
score[1] = score[1] + b2 - b1;
b1 = striped_epi2_seqedit_getval(us[2][0], W, i);
b2 = striped_epi2_seqedit_getval(us[2][1], W, i);
score[2] = score[2] + b2 - b1;
score[3] = num_min(score[0], score[1]);
if(score[2] != score[3]){
fflush(stdout); fprintf(stderr, " -- [%d] [%d,%d] %d != %d in %s -- %s:%d --\n", i, score[0], score[1], score[2], score[3], __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
}
}
#endif
}
static inline seqalign_result_t striped_epi2_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, int verbose){
seqalign_result_t rs;
b1i *memp, *mempb, *qprof, *uts[2], *us[2][2], *hs;
u8i mpsize;
u4i i, W;
int rx, ry, sbeg, smin;
UNUSED(mode);
W = roundup_times(qlen, WORDSIZE * 8) / (WORDSIZE * 8);
mpsize = 0;
mpsize += striped_epi2_seqedit_qprof_size(qlen); mpsize += 2 * W * WORDSIZE * Int64((tlen + 1)); mpsize += W * WORDSIZE; if(mempool){
if(mempool->aligned < WORDSIZE){
fflush(stdout); fprintf(stderr, " -- mempool should be aligned by (%d) but (%d) bytes in %s -- %s:%d --\n", WORDSIZE, mempool->aligned, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
clear_and_encap_b1v(mempool, mpsize);
memp = mempool->buffer + WORDSIZE;
mempb = NULL;
} else {
mempb = malloc(mpsize);
memp = mempb + WORDSIZE;
}
qprof = memp; memp += striped_epi2_seqedit_qprof_size(qlen);
uts[0] = memp; memp += W * WORDSIZE * Int64(tlen + 1);
uts[1] = memp; memp += W * WORDSIZE * Int64(tlen + 1);
hs = memp; memp += W * WORDSIZE;
striped_epi2_seqedit_set_query_prof(qseq, qlen, qprof);
us[0][0] = uts[0];
us[0][1] = uts[1];
rx = qlen - 1;
ry = tlen - 1;
smin = MAX_B4;
striped_epi2_seqedit_row_init(us[0], W, mode);
for(i=0;i<tlen;i++){
sbeg = i + 1;
us[0][0] = uts[0] + Int64(i + 0) * W * WORDSIZE;
us[0][1] = uts[1] + Int64(i + 0) * W * WORDSIZE;
us[1][0] = uts[0] + Int64(i + 1) * W * WORDSIZE;
us[1][1] = uts[1] + Int64(i + 1) * W * WORDSIZE;
striped_epi2_seqedit_row_cal(0, us, hs, qprof, W, tseq[i], mode);
if(verbose){
int vals[2][2] = {{0, 1}, {-1, 2}};
int j, b1, b2, u, u2, v, v2, score, error;
score = i + 1;
v2 = 1;
error = 0;
fprintf(stdout, "[%04d:%c]\t", i, "ACGTN"[tseq[i]]);
for(j=0;j<Int(qlen);j++){
b1 = striped_epi2_seqedit_getval(us[0][0], W, j);
b2 = striped_epi2_seqedit_getval(us[0][1], W, j);
u = vals[b1][b2];
v = v2;
if(qseq[j] == tseq[i] || u == -1 || v == -1){
u2 = 0 - v;
v2 = 0 - u;
} else {
u2 = 1 - v;
v2 = 1 - u;
}
b1 = striped_epi2_seqedit_getval(us[1][0], W, j);
b2 = striped_epi2_seqedit_getval(us[1][1], W, j);
if(error == 0 && u2 != vals[b1][b2]){
fflush(stdout);
fprintf(stderr, " -- j=%d s=%c%c u=%d v=%d u2=%d should be %d in %s -- %s:%d --\n", j, "ACGTN"[qseq[j]], "ACGTN"[tseq[i]], u, v, vals[b1][b2], u2, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
error = 1;
}
if(b1 == 0 && b2 == 1) score ++;
else if(b1 == 1 && b2 == 0) score --;
fprintf(stdout, "%c%03d:%c:%c ", "ACGTN"[qseq[j]], score, "-*+"[vals[b1][b2] + 1], "-*+"[v2 + 1]);
}
fprintf(stdout, "\n");
}
}
rs = striped_epi2_seqedit_backtrace(uts, W, mode, qseq, rx, tseq, ry, cigars);
if(mempb) free(mempb);
return rs;
}
static inline int banded_striped_epi8_seqalign_get_piecewise(b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, int bandwidth){
if(gapo2 < gapo1 && gape2 > gape1 && gapo2 + gape2 < gapo1 + gape1 && (gapo1 - gapo2) / (gape1 - gape2) < bandwidth){
return 2;
} else if(gapo1){
return 1;
} else {
return 0;
}
}
static void banded_striped_epi8_seqalign_piecex_row_init(b1i *us, b1i *es, b1i *qs, int *ubegs, int *rbeg, int mode, u4i bandwidth, b1i max_nt, b1i min_nt, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2){
xint ZERO, MIN, GAP;
u4i k, W, xp;
int s, t;
W = bandwidth / WORDSIZE;
ZERO = mm_set1_epi8(0);
MIN = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
if(seqalign_mode_type(mode) == SEQALIGN_MODE_GLOBAL || seqalign_mode_type(mode) == SEQALIGN_MODE_EXTEND){
if(gapo2 < gapo1 && gape2 > gape1 && gapo2 + gape2 < gapo1 + gape1 && (gapo1 - gapo2) / (gape1 - gape2) < Int(bandwidth)){
xp = (gapo2 - gapo1) / (gape1 - gape2);
GAP = mm_set1_epi8(gape2);
for(k=0;k<W;k++) mm_store(((xint*)us) + k, GAP);
for(k=0;k<WORDSIZE;k++) ubegs[k] = gape2 * W;
us[0] = gapo1 + gape1 + min_nt - max_nt;
ubegs[0] += us[0] - gape2;
for(k=1;k<xp;k++){
us[banded_striped_epi8_pos2idx(bandwidth, k)] = gape1;
ubegs[k / W] += gape1 - gape2;
}
} else {
GAP = mm_set1_epi8(gape1);
for(k=0;k<W;k++) mm_store(((xint*)us) + k, GAP);
us[0] = gapo1 + gape1 + min_nt - max_nt;
for(k=0;k<WORDSIZE;k++) ubegs[k] = gape1 * W;
ubegs[0] += us[0] - gape1;
}
s = max_nt - min_nt;
for(k=0;k<WORDSIZE;k++){
t = ubegs[k];
ubegs[k] = s;
s += t;
}
ubegs[k] = s;
if(rbeg) *rbeg = 0;
} else {
for(k=0;k<W;k++) mm_store(((xint*)us) + k, ZERO);
if(rbeg) *rbeg = 0;
memset(ubegs, 0, (WORDSIZE + 1) * sizeof(int));
}
if(gapo2 < gapo1 && gape2 > gape1 && gapo2 + gape2 < gapo1 + gape1 && (gapo1 - gapo2) / (gape1 - gape2) < Int(bandwidth)){
for(k=0;k<W;k++) mm_store(((xint*)es) + k, MIN);
for(k=0;k<W;k++) mm_store(((xint*)qs) + k, MIN);
} else if(gapo1){
for(k=0;k<W;k++) mm_store(((xint*)es) + k, MIN);
} else {
}
}
static inline void banded_striped_epi8_seqalign_set_query_prof_native(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]){
b1i *qp;
u4i xlen, x, pos, i, j, W, b;
W = bandwidth / WORDSIZE;
xlen = num_max(qlen, bandwidth); for(x=0;x<=xlen;x++){ qp = qprof + x * 4 * WORDSIZE;
for(j=0;j<WORDSIZE;j++){
pos = x + j * W;
if(pos < qlen){
b = qseq[pos];
b *= 4;
for(i=0;i<4;i++){
qp[i * WORDSIZE + j] = mtx[b + i];
}
} else {
for(i=0;i<4;i++){
qp[i * WORDSIZE + j] = SEQALIGN_SCORE_EPI8_MIN;
}
}
}
}
}
static inline void banded_striped_epi8_seqalign_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]){
int W, xlen, x, y, z, pos, i, j, jb, je;
b1i c;
W = bandwidth / WORDSIZE;
xlen = num_max(qlen, bandwidth);
z = W * 4 * WORDSIZE - 1;
for(pos=0;pos<2*xlen;pos++){
for(i=0;i<4;i++){
if(pos < Int(qlen)){
c = mtx[(qseq[pos] << 2) + i];
} else {
c = SEQALIGN_SCORE_EPI8_MIN;
}
y = ((pos << 2) + i) * WORDSIZE;
jb = (pos - xlen + W - 1) / W;
jb = num_max(jb, 0);
je = pos / W;
je = num_min(je, WORDSIZE - 1);
for(j=jb;j<=je;j++){
x = y - j * z;
qprof[x] = c;
}
}
}
}
static inline void banded_striped_epi8_seqalign_set_query_prof_hpc(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16], b1i bonus){
int W, xlen, x, y, z, pos, i, j, jb, je, c;
W = bandwidth / WORDSIZE;
xlen = num_max(qlen, bandwidth);
z = W * 4 * WORDSIZE - 1;
for(pos=0;pos<2*xlen;pos++){
for(i=0;i<4;i++){
if(pos < Int(qlen)){
c = mtx[(qseq[pos] << 2) + i];
if(pos + 1 < Int(qlen) && qseq[pos] != qseq[pos + 1]){
c += bonus;
}
} else {
c = SEQALIGN_SCORE_EPI8_MIN;
}
y = ((pos << 2) + i) * WORDSIZE;
jb = (pos - xlen + W - 1) / W;
jb = num_max(jb, 0);
je = pos / W;
je = num_min(je, WORDSIZE - 1);
for(j=jb;j<=je;j++){
x = y - j * z;
qprof[x] = c;
}
}
}
}
#if 0#else
#define banded_striped_epi8_seqalign_piecex_row_check_ubegs(us, ubegs, W)
#endif
static inline void banded_striped_epi8_seqalign_piecex_row_movx(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i movx, int piecewise, b1i nt_max, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2){
xint u, x, UBS[4], SHUFF1, SHUFF2;
u4i i, div, mov, cyc;
u1i shuff[WORDSIZE];
#ifdef __AVX2__
fflush(stdout); fprintf(stderr, " -- Cannot work with AVX2 in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
#endif
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[1], ubegs[1], W);
if(movx >= W * WORDSIZE){
memset(us[0], 0, W * WORDSIZE);
if(piecewise) memset(es[0], 0, W * WORDSIZE);
if(piecewise == 2) memset(qs[0], 0, W * WORDSIZE);
memset(us[0], 0, W * WORDSIZE);
for(i=0;i<=WORDSIZE;i++) ubegs[0][i] = SEQALIGN_SCORE_MIN;
return;
} else if(movx == 0){
memcpy(us[0], us[1], W * WORDSIZE);
if(piecewise){
memcpy(es[0], es[1], W * WORDSIZE);
}
if(piecewise == 2){
memcpy(qs[0], qs[1], W * WORDSIZE);
}
memcpy(ubegs[0], ubegs[1], (WORDSIZE + 1) * sizeof(int));
return;
}
cyc = movx / W;
for(i=0;i+cyc<WORDSIZE;i++) shuff[i] = i + cyc;
for(;i<WORDSIZE;i++) shuff[i] = 0xFF;
SHUFF1 = mm_load((xint*)shuff);
SHUFF2 = mm_srli(SHUFF1, 1);
SHUFF2 = mm_insert_epi8(SHUFF2, 0xFF, WORDSIZE - 1);
cyc = movx / W;
mov = movx % W;
div = W - mov;
if(cyc){
for(i=0;i<div;i++){
u = mm_load(((xint*)us[1]) + i + mov);
u = mm_shuffle(u, SHUFF1); mm_store(((xint*)us[0]) + i, u);
}
if(piecewise){
for(i=0;i<div;i++){
x = mm_load(((xint*)es[1]) + i + mov);
x = mm_shuffle(x, SHUFF1);
mm_store(((xint*)es[0]) + i, x);
}
}
if(piecewise == 2){
for(i=0;i<div;i++){
x = mm_load(((xint*)qs[1]) + i + mov);
x = mm_shuffle(x, SHUFF1);
mm_store(((xint*)qs[0]) + i, x);
}
}
} else {
memcpy(us[0], us[1] + mov * WORDSIZE, div * WORDSIZE);
if(piecewise){
memcpy(es[0], es[1] + mov * WORDSIZE, div * WORDSIZE);
}
if(piecewise == 2){
memcpy(qs[0], qs[1] + mov * WORDSIZE, div * WORDSIZE);
}
}
if(mov){
{
UBS[0] = mm_load(((xint*)ubegs[1]) + 0);
UBS[1] = mm_load(((xint*)ubegs[1]) + 1);
UBS[2] = mm_load(((xint*)ubegs[1]) + 2);
UBS[3] = mm_load(((xint*)ubegs[1]) + 3);
}
for(i=div;i<W;i++){
u = mm_load(((xint*)us[1]) + i - div);
UBS[0] = mm_add_epi32(UBS[0], mm_cvtepi8x0_epi32(u));
UBS[1] = mm_add_epi32(UBS[1], mm_cvtepi8x1_epi32(u));
UBS[2] = mm_add_epi32(UBS[2], mm_cvtepi8x2_epi32(u));
UBS[3] = mm_add_epi32(UBS[3], mm_cvtepi8x3_epi32(u));
u = mm_shuffle(u, SHUFF2); mm_store(((xint*)us[0]) + i, u);
}
{
mm_store(((xint*)ubegs[0]) + 0, UBS[0]);
mm_store(((xint*)ubegs[0]) + 1, UBS[1]);
mm_store(((xint*)ubegs[0]) + 2, UBS[2]);
mm_store(((xint*)ubegs[0]) + 3, UBS[3]);
}
if(piecewise){
for(i=div;i<W;i++){
x = mm_load(((xint*)es[1]) + i - div);
x = mm_shuffle(x, SHUFF2);
mm_store(((xint*)es[0]) + i, x);
}
}
if(piecewise == 2){
for(i=div;i<W;i++){
x = mm_load(((xint*)qs[1]) + i - div);
x = mm_shuffle(x, SHUFF2);
mm_store(((xint*)qs[0]) + i, x);
}
}
}
{
u4i a, a2, b, b2, d;
int c;
if(mov){
memmove(ubegs[0], ubegs[0] + cyc, (WORDSIZE - cyc) * sizeof(int));
} else {
memmove(ubegs[0], ubegs[1] + cyc, (WORDSIZE - cyc) * sizeof(int));
}
for(i=WORDSIZE-cyc;i<=WORDSIZE;i++) ubegs[0][i] = ubegs[1][WORDSIZE];
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[0], ubegs[0], W);
if(piecewise == 2) d = (gapo1 - gapo2) / (gape2 - gape1);
else d = W * WORDSIZE + 1;
i = W * WORDSIZE - movx;
a = i % W;
a2 = (i + d) % W;
b = i / W;
b2 = (i + d) / W;
if(piecewise == 2){
c = num_min(nt_min, gapo2 + gape2) - 1 - nt_max + (gapo2 + gape2);
} else {
c = num_min(nt_min, gapo1 + gape1) - 1 - nt_max + (gapo1 + gape1);
}
us[0][(i % W) * WORDSIZE + (i / W)] = c;
a ++;
for(;b<WORDSIZE&&b<=b2;b++){
if(b == b2){
c += (a2 - a) * gape1;
for(;a<a2;a++) us[0][a * WORDSIZE + b] = gape1;
a = a2;
if(a2 < W) break;
}
c += (W - a) * gape1;
for(;a<W;a++) us[0][a * WORDSIZE + b] = gape1;
ubegs[0][b + 1] += c;
a = 0;
}
for(;b<WORDSIZE;b++){
c += (W - a) * gape2;
for(;a<W;a++) us[0][a * WORDSIZE + b] = gape2;
ubegs[0][b + 1] += c;
a = 0;
}
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[0], ubegs[0], W);
}
}
static inline void banded_striped_epi8_seqalign_piecex_row_mov(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i mov, int piecewise, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2){
xint u, x, UBS[4];
u4i i, div;
UNUSED(nt_min);
UNUSED(gapo1);
UNUSED(gape1);
UNUSED(gapo2);
UNUSED(gape2);
if(mov > W){
fprintf(stderr, " -- mov(%d) > W(%d) in %s -- %s:%d --\n", mov, W, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
div = W - mov;
for(i=0;i<div;i++){
u = mm_load(((xint*)us[1]) + i + mov);
mm_store(((xint*)us[0]) + i, u);
}
if(piecewise){
for(i=0;i<div;i++){
x = mm_load(((xint*)es[1]) + i + mov);
mm_store(((xint*)es[0]) + i, x);
}
}
if(piecewise == 2){
for(i=0;i<div;i++){
x = mm_load(((xint*)qs[1]) + i + mov);
mm_store(((xint*)qs[0]) + i, x);
}
}
if(!mov){
memcpy(ubegs[0], ubegs[1], (WORDSIZE + 1) * sizeof(int));
return;
}
{
u = mm_load(((xint*)us[1]) + i + mov - W);
u = mm_srli(u, 1);
u = mm_insert_epi8(u, SEQALIGN_SCORE_EPI8_MIN, WORDSIZE - 1);
}
for(i=div;i<W;i++){
mm_store(((xint*)us[0]) + i, u);
u = mm_load(((xint*)us[1]) + (i + mov - W + 1) % W);
u = mm_srli(u, 1);
}
{
UBS[0] = mm_load(((xint*)ubegs[1]) + 0);
UBS[1] = mm_load(((xint*)ubegs[1]) + 1);
UBS[2] = mm_load(((xint*)ubegs[1]) + 2);
UBS[3] = mm_load(((xint*)ubegs[1]) + 3);
}
for(i=0;i<mov;i++){
u = mm_load(((xint*)us[1]) + i);
UBS[0] = mm_add_epi32(UBS[0], mm_cvtepi8x0_epi32(u));
UBS[1] = mm_add_epi32(UBS[1], mm_cvtepi8x1_epi32(u));
UBS[2] = mm_add_epi32(UBS[2], mm_cvtepi8x2_epi32(u));
UBS[3] = mm_add_epi32(UBS[3], mm_cvtepi8x3_epi32(u));
}
{
mm_store(((xint*)ubegs[0]) + 0, UBS[0]);
mm_store(((xint*)ubegs[0]) + 1, UBS[1]);
mm_store(((xint*)ubegs[0]) + 2, UBS[2]);
mm_store(((xint*)ubegs[0]) + 3, UBS[3]);
ubegs[0][WORDSIZE] = ubegs[1][WORDSIZE] + SEQALIGN_SCORE_EPI8_MIN;
}
if(piecewise){
for(i=div;i<W;i++){
x = mm_load(((xint*)es[1]) + i + mov - W);
x = mm_srli(x, 1);
mm_store(((xint*)es[0]) + i, x);
}
}
if(piecewise == 2){
for(i=div;i<W;i++){
x = mm_load(((xint*)qs[1]) + i + mov - W);
x = mm_srli(x, 1);
mm_store(((xint*)qs[0]) + i, x);
}
}
}
static inline void banded_striped_epi8_seqalign_piecex_row_merge(b1i *us[3], b1i *es[3], b1i *qs[3], int *ubegs[3], u4i W, int piecewise){
xint s[2][4], t[2][2], m[2][2], u[2], x[2][2];
u4i i, ie, j, k, p;
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[0], ubegs[0], W);
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[1], ubegs[1], W);
for(k=0;k<2;k++){
for(j=0;j<4;j++){
s[k][j] = mm_load(((xint*)ubegs[k]) + j);
}
}
p = 0;
for(j=0;j<4;j++){
mm_store(((xint*)ubegs[2]) + j, mm_max_epi32(s[0][j], s[1][j]));
}
ubegs[2][WORDSIZE] = num_max(ubegs[0][WORDSIZE], ubegs[1][WORDSIZE]);
#if 1
int rs[3][WORDSIZE];
memcpy(rs[0], ubegs[0], WORDSIZE * sizeof(int));
memcpy(rs[1], ubegs[1], WORDSIZE * sizeof(int));
for(i=0;i<WORDSIZE;i++) rs[2][i] = num_max(rs[0][i], rs[1][i]);
#endif
for(i=0;i<W;){
ie = num_min(i + 256, W); {
u[0] = mm_sub_epi32(s[0][0], s[1][0]);
u[0] = mm_max_epi32(u[0], mm_set1_epi32(-0x7FFF));
u[0] = mm_min_epi32(u[0], mm_set1_epi32( 0x7FFF));
x[0][0] = mm_srai_epi32(u[0], 1); x[1][0] = mm_sub_epi32(x[0][0], u[0]);
u[0] = mm_sub_epi32(s[0][1], s[1][1]);
u[0] = mm_max_epi32(u[0], mm_set1_epi32(-0x7FFF));
u[0] = mm_min_epi32(u[0], mm_set1_epi32( 0x7FFF));
x[0][1] = mm_srai_epi32(u[0], 1);
x[1][1] = mm_sub_epi32(x[0][1], u[0]);
s[0][0] = mm_sub_epi32(s[0][0], x[0][0]);
s[0][1] = mm_sub_epi32(s[0][1], x[0][1]);
s[1][0] = mm_sub_epi32(s[1][0], x[1][0]);
s[1][1] = mm_sub_epi32(s[1][1], x[1][1]);
t[0][0] = mm_packs_epi32(x[0][0], x[0][1]);
t[1][0] = mm_packs_epi32(x[1][0], x[1][1]);
u[0] = mm_sub_epi32(s[0][2], s[1][2]);
u[0] = mm_max_epi32(u[0], mm_set1_epi32(-0x7FFF));
u[0] = mm_min_epi32(u[0], mm_set1_epi32( 0x7FFF));
x[0][0] = mm_srai_epi32(u[0], 1);
x[1][0] = mm_sub_epi32(x[0][0], u[0]);
u[0] = mm_sub_epi32(s[0][3], s[1][3]);
u[0] = mm_max_epi32(u[0], mm_set1_epi32(-0x7FFF));
u[0] = mm_min_epi32(u[0], mm_set1_epi32( 0x7FFF));
x[0][1] = mm_srai_epi32(u[0], 1);
x[1][1] = mm_sub_epi32(x[0][1], u[0]);
s[0][2] = mm_sub_epi32(s[0][2], x[0][0]);
s[0][3] = mm_sub_epi32(s[0][3], x[0][1]);
s[1][2] = mm_sub_epi32(s[1][2], x[1][0]);
s[1][3] = mm_sub_epi32(s[1][3], x[1][1]);
t[0][1] = mm_packs_epi32(x[0][0], x[0][1]);
t[1][1] = mm_packs_epi32(x[1][0], x[1][1]);
p = 0;
m[p][0] = mm_max_epi16(t[0][0], t[1][0]);
m[p][1] = mm_max_epi16(t[0][1], t[1][1]);
}
for(;i<ie;i++){
p = !p;
u[0] = mm_load(((xint*)us[0]) + i);
t[0][0] = mm_adds_epi16(t[0][0], mm_cvtepi8x0_epi16(u[0]));
t[0][1] = mm_adds_epi16(t[0][1], mm_cvtepi8x1_epi16(u[0]));
u[1] = mm_load(((xint*)us[1]) + i);
t[1][0] = mm_adds_epi16(t[1][0], mm_cvtepi8x0_epi16(u[1]));
t[1][1] = mm_adds_epi16(t[1][1], mm_cvtepi8x1_epi16(u[1]));
m[p][0] = mm_max_epi16(t[0][0], t[1][0]);
m[!p][0] = mm_subs_epi16(m[p][0], m[!p][0]);
m[p][1] = mm_max_epi16(t[0][1], t[1][1]);
m[!p][1] = mm_subs_epi16(m[p][1], m[!p][1]);
u[0] = mm_packs_epi16(m[!p][0], m[!p][1]);
mm_store(((xint*)us[2]) + i, u[0]);
#if 1
for(j=0;j<WORDSIZE;j++){
rs[0][j] += us[0][i * WORDSIZE + j];
rs[1][j] += us[1][i * WORDSIZE + j];
int rm = num_max(rs[0][j], rs[1][j]);
if(rm - rs[2][j] != us[2][i * WORDSIZE + j]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
rs[2][j] = rm;
}
#endif
if(piecewise == 0) continue;
u[0] = mm_load(((xint*)es[0]) + i);
x[0][0] = mm_adds_epi16(t[0][0], mm_cvtepi8x0_epi16(u[0]));
x[0][1] = mm_adds_epi16(t[0][1], mm_cvtepi8x1_epi16(u[0]));
u[1] = mm_load(((xint*)es[1]) + i);
x[1][0] = mm_adds_epi16(t[1][0], mm_cvtepi8x0_epi16(u[1]));
x[1][1] = mm_adds_epi16(t[1][1], mm_cvtepi8x1_epi16(u[1]));
m[!p][0] = mm_max_epi16(x[0][0], x[1][0]);
m[!p][0] = mm_subs_epi16(m[!p][0], m[p][0]);
m[!p][1] = mm_max_epi16(x[0][1], x[1][1]);
m[!p][1] = mm_subs_epi16(m[!p][1], m[p][1]);
u[0] = mm_packs_epi16(m[!p][0], m[!p][1]);
mm_store(((xint*)es[2]) + i, u[0]);
if(piecewise == 1) continue;
u[0] = mm_load(((xint*)qs[0]) + i);
x[0][0] = mm_adds_epi16(t[0][0], mm_cvtepi8x0_epi16(u[0]));
x[0][1] = mm_adds_epi16(t[0][1], mm_cvtepi8x1_epi16(u[0]));
u[1] = mm_load(((xint*)qs[1]) + i);
x[1][0] = mm_adds_epi16(t[1][0], mm_cvtepi8x0_epi16(u[1]));
x[1][1] = mm_adds_epi16(t[1][1], mm_cvtepi8x1_epi16(u[1]));
m[!p][0] = mm_max_epi16(x[0][0], x[1][0]);
m[!p][0] = mm_subs_epi16(m[!p][0], m[p][0]);
m[!p][1] = mm_max_epi16(x[0][1], x[1][1]);
m[!p][1] = mm_subs_epi16(m[!p][1], m[p][1]);
u[0] = mm_packs_epi16(m[!p][0], m[!p][1]);
mm_store(((xint*)qs[2]) + i, u[0]);
}
{
s[0][0] = mm_add_epi32(s[0][0], mm_cvtepi16x0_epi32(t[0][0]));
s[0][1] = mm_add_epi32(s[0][1], mm_cvtepi16x1_epi32(t[0][0]));
s[0][2] = mm_add_epi32(s[0][2], mm_cvtepi16x0_epi32(t[0][1]));
s[0][3] = mm_add_epi32(s[0][3], mm_cvtepi16x1_epi32(t[0][1]));
s[1][0] = mm_add_epi32(s[1][0], mm_cvtepi16x0_epi32(t[1][0]));
s[1][1] = mm_add_epi32(s[1][1], mm_cvtepi16x1_epi32(t[1][0]));
s[1][2] = mm_add_epi32(s[1][2], mm_cvtepi16x0_epi32(t[1][1]));
s[1][3] = mm_add_epi32(s[1][3], mm_cvtepi16x1_epi32(t[1][1]));
}
}
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[2], ubegs[2], W);
}
static inline int banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(xint h, xint u, xint v, b1i *us[2], b1i *fs, int *ubegs[2], u4i W){
u4i i;
UNUSED(W);
v = mm_subs_epi8(h, u); mm_store((xint*)fs, v);
for(i=1;i<=WORDSIZE;i++){
ubegs[1][i] = ubegs[0][i] + fs[i - 1];
}
v = mm_slli(v, 1);
u = mm_load(((xint*)us[1]) + 0);
u = mm_subs_epi8(u, v); mm_store(((xint*)us[1]) + 0, u);
ubegs[1][0] = ubegs[0][0] + us[1][0];
us[1][0] = 0;
banded_striped_epi8_seqalign_piecex_row_check_ubegs(us[1], ubegs[1], W);
return ubegs[1][0];
}
static inline xint banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(u4i W, xint f, b1i *fs, int *ubegs[2], b1i gape){
int i, s, t;
f = mm_slli(f, 1);
mm_store((xint*)fs, f);
fs[0] = SEQALIGN_SCORE_EPI8_MIN;
t = W * gape;
s = t + fs[0] - (ubegs[0][1] - ubegs[0][0]);
for(i=1;i<WORDSIZE;i++){
if(fs[i] < s) fs[i] = s;
s = t + fs[i] - (ubegs[0][i + 1] - ubegs[0][i]);
}
f = mm_load((xint*)fs);
return f;
}
static inline int banded_striped_epi8_seqalign_piece0_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, f, u, v, c, d, z;
xint I, D, GapE;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(es);
UNUSED(qs);
UNUSED(gapo1);
UNUSED(gapo2);
UNUSED(gape2);
UNUSED(mov);
I = mm_set1_epi8(SEQALIGN_BT_I);
D = mm_set1_epi8(SEQALIGN_BT_D);
GapE = mm_set1_epi8(gape1);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + gape1; if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_adds_epi8(u, GapE);
h = mm_max_epi8(e, h);
h = mm_max_epi8(f, h);
f = mm_adds_epi8(h, GapE);
f = mm_subs_epi8(f, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
u = mm_set1_epi8(0); for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_adds_epi8(u, GapE);
c = mm_cmpgt_epi8(e, z);
d = mm_and(c, D); h = mm_max_epi8(e, z);
c = mm_cmpgt_epi8(f, h);
d = mm_blendv(d, I, c);
h = mm_max_epi8(f, h);
mm_store(((xint*)bs) + i, d);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
f = mm_adds_epi8(h, GapE);
f = mm_subs_epi8(f, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piece0_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, f, u, v, z;
xint GapE;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(es);
UNUSED(qs);
UNUSED(gapo1);
UNUSED(gapo2);
UNUSED(gape2);
UNUSED(mov);
GapE = mm_set1_epi8(gape1);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + gape1; if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_adds_epi8(u, GapE);
h = mm_max_epi8(e, h);
h = mm_max_epi8(f, h);
f = mm_adds_epi8(h, GapE);
f = mm_subs_epi8(f, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
u = mm_set1_epi8(0); for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_adds_epi8(u, GapE);
h = mm_max_epi8(e, z);
h = mm_max_epi8(f, h);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
f = mm_adds_epi8(h, GapE);
f = mm_subs_epi8(f, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piece1_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, f, u, v, c, d, z;
xint I, IE, D, DE, GapOE, GapE;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(qs);
UNUSED(gapo2);
UNUSED(gape2);
UNUSED(mov);
I = mm_set1_epi8(SEQALIGN_BT_I);
IE = mm_set1_epi8(SEQALIGN_BT1_IE);
D = mm_set1_epi8(SEQALIGN_BT_D);
DE = mm_set1_epi8(SEQALIGN_BT1_DE);
GapOE = mm_set1_epi8(gapo1 + gape1);
GapE = mm_set1_epi8(gape1);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + es[0][0]; if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
e = mm_adds_epi8(e, u);
h = mm_max_epi8(e, h);
h = mm_max_epi8(f, h);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
u = mm_set1_epi8(0); v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
for(i=0;__builtin_expect(i<W, 0);i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
e = mm_adds_epi8(e, u);
c = mm_cmpgt_epi8(e, z);
d = mm_and(c, D); h = mm_max_epi8(e, z);
c = mm_cmpgt_epi8(f, h);
d = mm_blendv(d, I, c);
h = mm_max_epi8(f, h);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
e = mm_adds_epi8(e, GapE);
e = mm_subs_epi8(e, h);
c = mm_cmpgt_epi8(e, GapOE);
d = mm_or(d, mm_and(c, DE));
e = mm_max_epi8(e, GapOE);
mm_store(((xint*)es[1]) + i, e);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
c = mm_cmpgt_epi8(f, h);
d = mm_or(d, mm_and(c, IE));
f = mm_max_epi8(f, h);
mm_store(((xint*)bs) + i, d);
f = mm_subs_epi8(f, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
h = mm_subs_epi8(h, GapOE);
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piece1_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, f, u, v, z;
xint GapOE, GapE;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(qs);
UNUSED(gapo2);
UNUSED(gape2);
UNUSED(mov);
GapOE = mm_set1_epi8(gapo1 + gape1);
GapE = mm_set1_epi8(gape1);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + es[0][0]; if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
e = mm_adds_epi8(e, u);
h = mm_max_epi8(e, h);
h = mm_max_epi8(f, h);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
u = mm_set1_epi8(0); v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
for(i=0;__builtin_expect(i<W, 0);i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
e = mm_adds_epi8(e, u);
h = mm_max_epi8(e, z);
h = mm_max_epi8(f, h);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
e = mm_adds_epi8(e, GapE);
e = mm_subs_epi8(e, h);
e = mm_max_epi8(e, GapOE);
mm_store(((xint*)es[1]) + i, e);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
h = mm_subs_epi8(h, GapOE);
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piece2_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, q, f, g, u, v, c, d, z;
xint I1, I2, IE1, IE2, D1, D2, DE1, DE2, GapOE, GapE, GapQP, GapP, GapOQ;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(mov);
I1 = mm_set1_epi8(SEQALIGN_BT2_I1);
I2 = mm_set1_epi8(SEQALIGN_BT2_I2);
IE1 = mm_set1_epi8(SEQALIGN_BT2_IE1);
IE2 = mm_set1_epi8(SEQALIGN_BT2_IE2);
D1 = mm_set1_epi8(SEQALIGN_BT2_D1);
D2 = mm_set1_epi8(SEQALIGN_BT2_D2);
DE1 = mm_set1_epi8(SEQALIGN_BT2_DE1);
DE2 = mm_set1_epi8(SEQALIGN_BT2_DE2);
GapOE = mm_set1_epi8(gapo1 + gape1);
GapE = mm_set1_epi8(gape1);
GapQP = mm_set1_epi8(gapo2 + gape2);
GapP = mm_set1_epi8(gape2);
GapOQ = mm_subs_epi8(GapOE, GapQP);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
g = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + num_max(es[0][0], qs[0][0]); if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
q = mm_load(((xint*)qs[0]) + i);
e = mm_adds_epi8(e, u);
q = mm_adds_epi8(q, u);
h = mm_max_epi8(e, h);
h = mm_max_epi8(q, h);
h = mm_max_epi8(f, h);
h = mm_max_epi8(g, h);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
g = mm_adds_epi8(g, GapP);
h = mm_subs_epi8(h, GapOQ); g = mm_max_epi8(g, h);
g = mm_subs_epi8(g, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
g = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, g, fs, ubegs, gape2);
v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
u = mm_set1_epi8(0); for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
q = mm_load(((xint*)qs[0]) + i);
e = mm_adds_epi8(e, u);
c = mm_cmpgt_epi8(e, z);
d = mm_and(c, D1); h = mm_max_epi8(e, z);
q = mm_adds_epi8(q, u);
c = mm_cmpgt_epi8(q, h);
d = mm_blendv(d, D2, c);
h = mm_max_epi8(q, h);
c = mm_cmpgt_epi8(f, h);
d = mm_blendv(d, I1, c);
h = mm_max_epi8(f, h);
c = mm_cmpgt_epi8(g, h);
d = mm_blendv(d, I2, c);
h = mm_max_epi8(g, h);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
e = mm_adds_epi8(e, GapE);
e = mm_subs_epi8(e, h);
c = mm_cmpgt_epi8(e, GapOE);
d = mm_or(d, mm_and(c, DE1));
e = mm_max_epi8(e, GapOE);
mm_store(((xint*)es[1]) + i, e);
q = mm_adds_epi8(q, GapP);
q = mm_subs_epi8(q, h);
c = mm_cmpgt_epi8(q, GapQP);
d = mm_or(d, mm_and(c, DE2));
q = mm_max_epi8(q, GapQP);
mm_store(((xint*)qs[1]) + i, q);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
c = mm_cmpgt_epi8(f, h);
d = mm_or(d, mm_and(c, IE1));
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
g = mm_adds_epi8(g, GapP);
h = mm_subs_epi8(h, GapOQ); c = mm_cmpgt_epi8(g, h);
d = mm_or(d, mm_and(c, IE2));
g = mm_max_epi8(g, h);
mm_store(((xint*)bs) + i, d);
g = mm_subs_epi8(g, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
h = mm_subs_epi8(h, GapQP);
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piece2_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh){
xint h, e, q, f, g, u, v, z;
xint GapOE, GapE, GapQP, GapP, GapOQ;
b1i fs[WORDSIZE];
u4i i;
int t, h0, bt0;
UNUSED(mov);
GapOE = mm_set1_epi8(gapo1 + gape1);
GapE = mm_set1_epi8(gape1);
GapQP = mm_set1_epi8(gapo2 + gape2);
GapP = mm_set1_epi8(gape2);
GapOQ = mm_subs_epi8(GapOE, GapQP);
f = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
g = mm_set1_epi8(SEQALIGN_SCORE_EPI8_MIN);
h0 = (rh - ubegs[0][0]) + qprof[((rbeg + 0) * 4 + base) * WORDSIZE]; t = us[0][0] + num_max(es[0][0], qs[0][0]); if(h0 >= t){
if(h0 > SEQALIGN_SCORE_EPI8_MAX) h0 = SEQALIGN_SCORE_EPI8_MAX; bt0 = SEQALIGN_BT_M;
} else {
h0 = SEQALIGN_SCORE_EPI8_MIN;
bt0 = SEQALIGN_BT_D;
}
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
h = mm_insert_epi8(h, h0, 0);
for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
q = mm_load(((xint*)qs[0]) + i);
e = mm_adds_epi8(e, u);
q = mm_adds_epi8(q, u);
h = mm_max_epi8(e, h);
h = mm_max_epi8(q, h);
h = mm_max_epi8(f, h);
h = mm_max_epi8(g, h);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
g = mm_adds_epi8(g, GapP);
h = mm_subs_epi8(h, GapOQ); g = mm_max_epi8(g, h);
g = mm_subs_epi8(g, u);
h = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
f = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, f, fs, ubegs, gape1);
g = banded_striped_epi8_seqalign_piecex_row_cal_FPenetration_codes(W, g, fs, ubegs, gape2);
v = mm_set1_epi8(0);
h = mm_load(((xint*)qprof) + (rbeg + 0) * 4 + base);
z = mm_insert_epi8(h, h0, 0);
u = mm_set1_epi8(0); for(i=0;i<W;i++){
u = mm_load(((xint*)us[0]) + i);
e = mm_load(((xint*)es[0]) + i);
q = mm_load(((xint*)qs[0]) + i);
e = mm_adds_epi8(e, u);
h = mm_max_epi8(e, z);
q = mm_adds_epi8(q, u);
h = mm_max_epi8(q, h);
h = mm_max_epi8(f, h);
h = mm_max_epi8(g, h);
v = mm_subs_epi8(h, v);
mm_store(((xint*)us[1]) + i, v);
v = mm_subs_epi8(h, u);
e = mm_adds_epi8(e, GapE);
e = mm_subs_epi8(e, h);
e = mm_max_epi8(e, GapOE);
mm_store(((xint*)es[1]) + i, e);
q = mm_adds_epi8(q, GapP);
q = mm_subs_epi8(q, h);
q = mm_max_epi8(q, GapQP);
mm_store(((xint*)qs[1]) + i, q);
f = mm_adds_epi8(f, GapE);
h = mm_adds_epi8(h, GapOE);
f = mm_max_epi8(f, h);
f = mm_subs_epi8(f, u);
g = mm_adds_epi8(g, GapP);
h = mm_subs_epi8(h, GapOQ); g = mm_max_epi8(g, h);
g = mm_subs_epi8(g, u);
z = mm_load(((xint*)qprof) + (rbeg + i + 1) * 4 + base);
}
h = mm_subs_epi8(h, GapQP);
return banded_striped_epi8_seqalign_piecex_row_cal_tail_codes(h, u, v, us, fs, ubegs, W);
}
static inline int banded_striped_epi8_seqalign_piecex_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh, int piecewise){
if(piecewise == 2) return banded_striped_epi8_seqalign_piece2_row_cal(rbeg, base, us, es, qs, ubegs, qprof, gapo1, gape1, gapo2, gape2, W, mov, rh);
else if(piecewise == 1) return banded_striped_epi8_seqalign_piece1_row_cal(rbeg, base, us, es, qs, ubegs, qprof, gapo1, gape1, gapo2, gape2, W, mov, rh);
else return banded_striped_epi8_seqalign_piece0_row_cal(rbeg, base, us, es, qs, ubegs, qprof, gapo1, gape1, gapo2, gape2, W, mov, rh);
}
static inline int banded_striped_epi8_seqalign_getscore(b1i *us, int *ubegs, u4i W, u8i pos){
u4i i, x, y;
int s;
x = pos % W;
y = pos / W;
s = ubegs[y];
for(i=0;i<=x;i++){
s += us[i * WORDSIZE + y];
}
return s;
}
static inline int banded_striped_epi8_seqalign_mtx_getscore(b1i *ups, b1i *ubs, int *roffs, u4i W, int row, int col){
return banded_striped_epi8_seqalign_getscore(ups + row * (((b8i)W) << WORDSHIFT),
(int*)(ubs + row * roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE)), W, (col - roffs[row]));
}
static inline void println_epi32_bsalign(FILE *out, xint v){
int rs[WORDSIZE / 4], i;
mm_store((xint*)rs, v);
for(i=0;i<WORDSIZE/4;i++){
fprintf(out, "%d\t", rs[i]);
}
fprintf(out, "\n");
}
static inline u4i banded_striped_epi8_seqalign_row_max(b1i *us, int *ubegs, u4i W, int *max_score){
xint h, c, xi, Max[4], max[2], Scr[4], scr[2], Idx[4], Pos[4];
u4i i, j, x, y, STEP;
int ary[2][WORDSIZE/4], uscr, umax;
for(i=0;i<4;i++){
Scr[i] = mm_load(((xint*)ubegs) + i);
Max[i] = mm_set1_epi32(SEQALIGN_SCORE_MIN);
Idx[i] = mm_set1_epi32(0);
}
for(i=0;i<WORDSIZE/4;i++){ ary[0][i] = i; } Idx[0] = Pos[0] = mm_load((xint*)ary[0]);
for(i=0;i<WORDSIZE/4;i++){ ary[0][i] += WORDSIZE / 4; } Idx[1] = Pos[1] = mm_load((xint*)ary[0]);
for(i=0;i<WORDSIZE/4;i++){ ary[0][i] += WORDSIZE / 4; } Idx[2] = Pos[2] = mm_load((xint*)ary[0]);
for(i=0;i<WORDSIZE/4;i++){ ary[0][i] += WORDSIZE / 4; } Idx[3] = Pos[3] = mm_load((xint*)ary[0]);
STEP = 32;
for(i=0;i<W;i+=x){
x = num_min(i + STEP, W) - i;
{
max[0] = max[1] = mm_set1_epi16(-(MAX_B2));
scr[0] = scr[1] = mm_set1_epi16(0);
}
for(j=0;j<x;j++){
h = mm_load(((xint*)us) + i + j);
scr[0] = mm_adds_epi16(scr[0], mm_cvtepi8x0_epi16(h));
scr[1] = mm_adds_epi16(scr[1], mm_cvtepi8x1_epi16(h));
max[0] = mm_max_epi16(max[0], scr[0]);
max[1] = mm_max_epi16(max[1], scr[1]);
}
{
h = mm_add_epi32(Scr[0], mm_cvtepi16x0_epi32(max[0])); Idx[0] = mm_blendv(Idx[0], Pos[0], mm_cmpgt_epi32(h, Max[0])); Max[0] = mm_max_epi32(Max[0], h);
h = mm_add_epi32(Scr[1], mm_cvtepi16x1_epi32(max[0])); Idx[1] = mm_blendv(Idx[1], Pos[1], mm_cmpgt_epi32(h, Max[1])); Max[1] = mm_max_epi32(Max[1], h);
h = mm_add_epi32(Scr[2], mm_cvtepi16x0_epi32(max[1])); Idx[2] = mm_blendv(Idx[2], Pos[2], mm_cmpgt_epi32(h, Max[2])); Max[2] = mm_max_epi32(Max[2], h);
h = mm_add_epi32(Scr[3], mm_cvtepi16x1_epi32(max[1])); Idx[3] = mm_blendv(Idx[3], Pos[3], mm_cmpgt_epi32(h, Max[3])); Max[3] = mm_max_epi32(Max[3], h);
Scr[0] = mm_add_epi32(Scr[0], mm_cvtepi16x0_epi32(scr[0]));
Scr[1] = mm_add_epi32(Scr[1], mm_cvtepi16x1_epi32(scr[0]));
Scr[2] = mm_add_epi32(Scr[2], mm_cvtepi16x0_epi32(scr[1]));
Scr[3] = mm_add_epi32(Scr[3], mm_cvtepi16x1_epi32(scr[1]));
xi = mm_set1_epi32(1 << 8);
Pos[0] = mm_add_epi32(Pos[0], xi);
Pos[1] = mm_add_epi32(Pos[1], xi);
Pos[2] = mm_add_epi32(Pos[2], xi);
Pos[3] = mm_add_epi32(Pos[3], xi);
}
}
c = mm_cmpgt_epi32(Max[1], Max[0]); Idx[0] = mm_blendv(Idx[0], Idx[1], c); Max[0] = mm_max_epi32(Max[0], Max[1]);
c = mm_cmpgt_epi32(Max[3], Max[2]); Idx[1] = mm_blendv(Idx[2], Idx[3], c); Max[1] = mm_max_epi32(Max[2], Max[3]);
c = mm_cmpgt_epi32(Max[1], Max[0]); Idx[0] = mm_blendv(Idx[0], Idx[1], c); Max[0] = mm_max_epi32(Max[0], Max[1]);
mm_store(((xint*)ary[0]), Max[0]);
mm_store(((xint*)ary[1]), Idx[0]);
*max_score = ary[0][0];
x = 0;
for(i=1;i<WORDSIZE/4;i++){
if(ary[0][i] > *max_score){
*max_score = ary[0][i];
x = i;
}
}
x = ary[1][x];
i = x & 0xFF;
x = x >> 8;
y = num_min((x + 1) * STEP, W);
x = x * STEP;
j = x;
umax = SEQALIGN_SCORE_MIN; uscr = 0;
for(;x<y;x++){
uscr += us[x * WORDSIZE + i];
if(uscr > umax){
j = x;
umax = uscr;
}
}
x = i * W + j;
#if DEBUG
if(x == W * WORDSIZE){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(0){
int rs[WORDSIZE];
mm_store(((xint*)rs) + 0, Scr[0]);
mm_store(((xint*)rs) + 1, Scr[1]);
mm_store(((xint*)rs) + 2, Scr[2]);
mm_store(((xint*)rs) + 3, Scr[3]);
for(i=1;i<=WORDSIZE;i++){
if(ubegs[i] != rs[i-1]){
fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
y = 0; umax = SEQALIGN_SCORE_MIN; uscr = ubegs[0];
for(i=0;i<W*WORDSIZE;i++){
uscr += us[banded_striped_epi8_pos2idx(W * WORDSIZE, i)];
if(uscr > umax){
umax = uscr;
y = i;
}
}
if(y != x && banded_striped_epi8_seqalign_getscore(us, ubegs, W, x) != umax){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(umax != max_score[0]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
return x;
}
static inline int banded_striped_epi8_seqalign_band_mov(b1i *us, int *ubegs, u4i W, u4i tidx, u4i qoff, u4i qlen){
u4i i;
int noisy;
UNUSED(us);
if(tidx <= W * WORDSIZE / 4) return 0;
if(qoff + W * WORDSIZE >= qlen) return 0;
noisy = 0;
for(i=1;i<=WORDSIZE;i++){
noisy += num_diff(ubegs[i], ubegs[i-1]);
}
noisy = num_max(2 * WORDSIZE / 2, noisy / WORDSIZE / W * WORDSIZE / 2);
if(ubegs[0] + noisy < ubegs[WORDSIZE]){
return 2;
} else if(ubegs[0] > ubegs[WORDSIZE] + noisy){
return 0;
} else {
return 1;
}
}
static inline u4i banded_striped_epi8_seqalign_band_comb(u4i cnt, u4i *qoffs, int **ubegs, b1v *mempool, u4i W, u4i qlen){
typedef struct { u4i qpos[2]; int sval; } run_t;
run_t *runs, *r2;
int *row, *ary, smax, s;
u4i i, j, xlen, ri, rn, rx, ry, x, y, mpsz[3];
if(W * WORDSIZE >= qlen) return 0; xlen = roundup_times(qlen, WORDSIZE);
mpsz[0] = xlen * sizeof(int); mpsz[1] = mpsz[0] + (rn = cnt * WORDSIZE) * sizeof(run_t); mpsz[2] = mpsz[1] + 2 * rn * sizeof(int);
clear_and_encap_b1v(mempool, mpsz[2]);
row = (int*)(mempool->buffer + 0);
runs = (run_t*)(mempool->buffer + mpsz[0]);
ary = (int*)(mempool->buffer + mpsz[1]);
memset(row, 0x80, xlen * sizeof(int)); for(ri=i=0;i<cnt;i++){
for(j=0;j<W;j++){
runs[ri].qpos[0] = qoffs[i] + j * W;
runs[ri].qpos[1] = qoffs[i] + j * W + W;
runs[ri].sval = (ubegs[i][j] + ubegs[i][j]) / 2;
ri ++;
}
}
sort_array(runs, rn, run_t, num_cmpgt(a.qpos[1], b.qpos[1]));
for(i=0;i<2*rn;i++) ary[i] = i;
sort_array(ary, rn * 2, int, num_cmpgtx(runs[a >> 1].qpos[a & 0x1], runs[b >> 1].qpos[b & 0x1], b & 0x1, a & 0x1));
rx = ry = 0;
x = runs[ary[0] >> 1].qpos[0];
for(ri=0;ri<rn*2;ri++){
if((ary[ri] & 0x01) == 0){
ry = num_max(ary[ri] >> 1, Int(ry));
}
r2 = runs + (ary[ri] >> 1);
y = r2->qpos[0] + (ary[ri] & 0x1) * W;
if(y > x){
smax = - MAX_B4;
while(runs[rx].qpos[1] <= x) rx ++;
for(i=rx;i<=ry;i++){
if(runs[i].qpos[0] > x) continue;
if(runs[i].sval > smax) smax = runs[i].sval;
}
for(i=x;i<y;i++) row[i] = smax;
}
x = y;
}
smax = (int)0x80808080;
rx = MAX_U4;
x = 0;
while(x + W * WORDSIZE <= qlen){
while(x + W * WORDSIZE < qlen && row[x] == (int)0x80808080) x ++;
s = 0;
for(i=0;i<W*WORDSIZE;i++){
if(row[x + i] == (int)0x80808080) break;
s += row[x + i];
}
if(i < W * WORDSIZE){
x = i + 1;
continue;
}
if(s > smax){
smax = s;
rx = x;
}
while(x + W * WORDSIZE < qlen){
if(row[x + W * WORDSIZE] == (int)0x80808080){
x = x + W * WORDSIZE + 1;
break;
}
s -= row[x];
s += row[x + W * WORDSIZE];
x ++;
if(s > smax){
smax = s;
rx = x;
}
}
}
s = 0;
for(i=1;i<WORDSIZE;i++){
s += num_diff(row[rx + i * W], row[rx + (i + 1) * W]);
}
s = num_max(2 * WORDSIZE / 2, s / (WORDSIZE - 1) / W * WORDSIZE / 2);
return s;
}
static inline void banded_striped_epi8_seqalign_row_print(FILE *out, u1i *qseq, u4i qlen, u4i tidx, u4i tpos, u1i tbase, u4i bandwidth, u4i mov, u4i rbeg, u4i rmax, int max_score, int *ubegs, b1i *us, b1i *es, int detail){
u4i i, x, W;
int score;
W = bandwidth / WORDSIZE;
fprintf(out, "ROW[%d][%d][%c]\tMOV=%d\tBAND=%d,%d\tMAX=%d(%d),%d", tidx, tpos, "ACGTN-"[tbase], mov, rbeg, rbeg + bandwidth, rbeg + rmax, rmax, max_score);
if(detail > 2){
score = ubegs[0];
for(i=0;i<bandwidth;i++){
x = banded_striped_epi8_pos2idx(W << WORDSHIFT, i);
fprintf(out, "\t%d:%c%d:%d:%d", i + rbeg, "ACGTN-"[rbeg + i < qlen? qseq[rbeg + i] : 4], score + us[x], us[x], es? es[x] : 0);
score += us[x];
}
}
fprintf(out, "\n");
if(detail > 1){
for(i=0;i<=WORDSIZE;i++){
fprintf(out, "\t%d", ubegs[i]);
}
fprintf(out, "\n");
}
fflush(out);
}
static inline int array_epi32_sum(int *rs, int n){
xint v, s;
int i, m, ary[WORDSIZE / 4], sum;
s = mm_set1_epi32(0);
m = n / (WORDSIZE / 4);
for(i=0;i<m;i++){
v = mm_load(((xint*)rs) + i);
s = mm_add_epi32(s, v);
}
mm_store((xint*)ary, s);
sum = 0;
for(i=0;i<WORDSIZE/4;i++) sum += ary[i];
for(i=m*(WORDSIZE/4);i<n;i++){
sum += rs[i];
}
return sum;
}
static inline int banded_striped_epi8_seqalign_piecex_row_verify(int rowidx, int rowoff, int mode, int W, int mov, u1i tbase, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2){
int i, x1, x2, s1, s2, s, h, u, e, q, f, g, piecewise;
piecewise = banded_striped_epi8_seqalign_get_piecewise(gapo1, gape1, gapo2, gape2, W * WORDSIZE);
s1 = ubegs[0][0];
s2 = ubegs[1][0];
f = g = SEQALIGN_SCORE_MIN;
if(mov == 0){
if(rowoff) s = SEQALIGN_SCORE_MIN;
else if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || rowidx == 0) s = 0;
else if(piecewise < 2) s = gapo1 + gape1 * rowidx;
else s = num_max(gapo1 + gape1 * rowidx, gapo2 + gape2 * rowidx);
s += banded_striped_epi8_seqalign_get_qprof_value(qprof, rowoff, tbase);
s -= s1;
} else if(mov >= W * WORDSIZE){
return 1;
} else {
for(i=0;i<mov&&i<W*WORDSIZE;i++){
x1 = banded_striped_epi8_pos2idx(W << WORDSHIFT, i);
u = us[0][x1];
s1 += u;
if(((i + 1) % W) == 0){
if(ubegs[0][(i + 1) / W] != s1){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
}
s = banded_striped_epi8_seqalign_get_qprof_value(qprof, rowoff + mov, tbase);
}
for(i=mov;i<W*WORDSIZE;i++){
x1 = banded_striped_epi8_pos2idx(W << WORDSHIFT, i);
u = us[0][x1];
e = (piecewise == 0)? gape1 : es[0][x1];
q = (piecewise == 2)? qs[0][x1] : SEQALIGN_SCORE_MIN;
h = num_max(s, u + e);
h = num_max(h, u + q);
h += s1;
h = num_max(h, f);
h = num_max(h, g);
x2 = banded_striped_epi8_pos2idx(W << WORDSHIFT, i - mov);
if(h > SEQALIGN_SCORE_MIN / 2 && h != s2 + us[1][x2]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
f = (piecewise == 0)? h + gape1 : num_max(f + gape1, h + gapo1 + gape1);
g = (piecewise == 2)? num_max(g + gape2, h + gapo2 + gape2) : SEQALIGN_SCORE_MIN;
if(piecewise > 0 && h > SEQALIGN_SCORE_MIN / 2 && h + es[1][x2] != num_max(s1 + es[0][x1] + us[0][x1] + gape1, h + gapo1 + gape1)){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(piecewise == 2 && h > SEQALIGN_SCORE_MIN / 2 && h + qs[1][x2] != num_max(s1 + qs[0][x1] + us[0][x1] + gape2, h + gapo2 + gape2)){
fflush(stdout); fprintf(stderr, " -- i=%d q:%d!=%d in %s -- %s:%d --\n", i, num_max(s1 + qs[0][x1] + us[0][x1] + gape2, h + gapo2 + gape2) - h, qs[1][x2], __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
s1 += us[0][x1];
s2 += us[1][x2];
if(((i + 1) % W) == 0 && ubegs[0][(i + 1) / W] != s1){
fflush(stdout); fprintf(stderr, " -- something wrong i = %d in %s -- %s:%d --\n", i, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(((i - mov + 1) % W) == 0 && ubegs[1][(i - mov + 1) / W] != s2){
fflush(stdout); fprintf(stderr, " -- something wrong i = %d in %s -- %s:%d --\n", i, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
s = banded_striped_epi8_seqalign_get_qprof_value(qprof, rowoff + i + 1, tbase);
}
u = SEQALIGN_SCORE_EPI8_MIN;
for(;i<W*WORDSIZE+mov;i++){
h = num_max(s1 + s, f);
h = num_max(h, g);
x2 = banded_striped_epi8_pos2idx(W << WORDSHIFT, i - mov);
if(h > SEQALIGN_SCORE_MIN / 2 && h != s2 + us[1][x2]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
f = (piecewise == 0)? h + gape1 : num_max(f + gape1, h + gapo1 + gape1);
g = (piecewise == 2)? num_max(g + gape2, h + gapo2 + gape2) : SEQALIGN_SCORE_MIN;
if(piecewise > 0 && h > SEQALIGN_SCORE_MIN / 2 && h + es[1][x2] != h + gapo1 + gape1){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(piecewise == 2 && h > SEQALIGN_SCORE_MIN / 2 && h + qs[1][x2] != h + gapo2 + gape2){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
s1 += u;
s2 += us[1][x2];
if(((i - mov + 1) % W) == 0 && ubegs[1][(i - mov + 1) / W] != s2){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
u = 0;
s = SEQALIGN_SCORE_MIN;
}
return 0;
}
static inline u4i banded_striped_epi8_seqalign_piecex_backtrace(u1i *qseq, u1i *tseq, b1i *bs, int *begs, int mode, u4i bandwidth, int piecewise, seqalign_result_t *rs, u4v *cigars){
u4i btx, bty, cg, cgz;
u1i qbase, tbase, op, bt;
rs->qb = rs->qe; rs->qe ++;
rs->tb = rs->te; rs->te ++;
rs->mat = rs->mis = rs->ins = rs->del = rs->aln = 0;
bty = bt = cg = cgz = 0;
if(cigars){
if(mode & SEQALIGN_MODE_CIGRESV){
cgz = cigars->size;
} else {
clear_u4v(cigars);
}
}
while(1){
rs->aln ++;
btx = bs[Int64(rs->tb) * bandwidth + banded_striped_epi8_pos2idx(bandwidth, rs->qb - begs[rs->tb])];
if(piecewise == 0){
bt = bty = btx;
} else if(piecewise == 1){
if(bty == SEQALIGN_BT_M){
bt = bty = btx & 0x3;
} else {
bt = bty = ((btx >> (bt + 1)) & 0x1)? bty : SEQALIGN_BT_M;
}
} else {
if(bty == SEQALIGN_BT_M){
bty = btx & 0xf;
} else {
bty = ((btx >> (bt + 2)) & 0x1)? bty : SEQALIGN_BT_M;
}
bt = (0b1001100100 >> (bty * 2)) & 0x3;
}
if(bt == SEQALIGN_BT_M){
if(qseq && tseq){
qbase = qseq[rs->qb];
tbase = tseq[rs->tb];
if(qbase == tbase){
rs->mat ++;
op = 7; } else {
rs->mis ++;
op = 8; }
} else {
rs->mat ++;
op = 0; }
rs->qb --;
rs->tb --;
if(rs->qb < 0 || rs->tb < 0) break;
} else if(bt == SEQALIGN_BT_D){
rs->del ++;
rs->tb --;
op = 2; if(rs->tb < 0) break;
} else {
rs->ins ++;
rs->qb --;
op = 1; if(rs->qb < 0) break;
}
if(op == (cg & 0xf)){
cg += 0x10;
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = 0x10 | op;
}
}
if(op == (cg & 0xf)){
cg += 0x10;
if(cigars) push_u4v(cigars, cg);
} else {
if(cg && cigars) push_u4v(cigars, cg);
cg = 0x10 | op;
if(cigars) push_u4v(cigars, cg);
}
rs->qb ++;
rs->tb ++;
if(cigars) sub_reverse_u4v(cigars, cgz, cigars->size);
return rs->aln;
}
static inline int banded_striped_epi8_seqalign_piecex_backcal_cell(int x, u1i qbase, u1i tbase, int Hs[2], b1i u, b1i e, b1i q, u4i W, b1i *matrix, int piecewise, int prior_match){
int s, h;
s = matrix[qbase * 4 + tbase];
h = Hs[1] - Hs[0];
if(x > Int(W * WORDSIZE)){ return SEQALIGN_BT_I;
} else if(x == Int(W * WORDSIZE)){
if(h == s){
return SEQALIGN_BT_M;
} else {
return SEQALIGN_BT_I;
}
} else if(prior_match){
if(h == s){
return SEQALIGN_BT_M;
} else {
if(h == u + e){
return SEQALIGN_BT_D;
} else if(piecewise == 2 && h == u + q){
return SEQALIGN_BT2_D2;
} else {
return SEQALIGN_BT_I;
}
}
} else {
if(h == u + e){
return SEQALIGN_BT_D;
} else if(piecewise == 2 && h == u + q){
return SEQALIGN_BT2_D2;
} else if(h == s){
return SEQALIGN_BT_M;
} else {
return SEQALIGN_BT_I;
}
}
}
static inline u4i banded_striped_epi8_seqalign_piecex_backcal(u1i *qseq, u1i *tseq, b1i *ups, b1i *eps, b1i *qps, b1i *ubs, int *roffs, int mode, u4i bandwidth, b1i *matrix, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, seqalign_result_t *rs, u4v *cigars){
u4i bt, cg, cgz, op;
b8i t;
int piecewise, prior_match, W, sz, Hs[3];
piecewise = banded_striped_epi8_seqalign_get_piecewise(gapo1, gape1, gapo2, gape2, bandwidth);
rs->qb = rs->qe; rs->qe ++;
rs->tb = rs->te; rs->te ++;
rs->mat = rs->mis = rs->ins = rs->del = rs->aln = 0;
cg = cgz = 0;
if(cigars){
if(mode & SEQALIGN_MODE_CIGRESV){
cgz = cigars->size;
} else {
clear_u4v(cigars);
}
}
W = bandwidth / WORDSIZE;
Hs[0] = 0; Hs[1] = banded_striped_epi8_seqalign_mtx_getscore(ups, ubs, roffs, W, rs->tb, rs->qb);
Hs[2] = 0; prior_match = 0;
while(1){
if((Hs[2] & 0xf) == SEQALIGN_BT_D){ Hs[0] = banded_striped_epi8_seqalign_mtx_getscore(ups, ubs, roffs, W, rs->tb, rs->qb);
t = gapo1 + (Hs[2] >> 4) * gape1;
if(Hs[0] + t == Hs[1]){
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_BT_D, Hs[2] >> 4);
rs->del += Hs[2] >> 4;
rs->aln += Hs[2] >> 4;
Hs[1] = Hs[0];
Hs[2] = 0;
} else {
Hs[2] += 1 << 4;
rs->tb --;
continue;
}
} else if((Hs[2] & 0xf) == SEQALIGN_BT2_D2){ Hs[0] = banded_striped_epi8_seqalign_mtx_getscore(ups, ubs, roffs, W, rs->tb, rs->qb);
t = gapo2 + (Hs[2] >> 4) * gape2;
if(Hs[0] + t == Hs[1]){
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_BT_D, Hs[2] >> 4);
rs->del += Hs[2] >> 4;
rs->aln += Hs[2] >> 4;
Hs[1] = Hs[0];
Hs[2] = 0;
} else {
Hs[2] += 1 << 4;
rs->tb --;
continue;
}
}
if(rs->qb < 0 || rs->tb < 0){
break;
}
if(rs->qb == roffs[rs->tb - 1]){
if(rs->qb){
Hs[0] = ((int*)(ubs + (rs->tb - 1) * roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE)))[0];
prior_match = 0; } else {
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || rs->tb == 0) Hs[0] = 0;
else if(piecewise < 2) Hs[0] = gapo1 + gape1 * rs->tb;
else Hs[0] = num_max(gapo1 + gape1 * rs->tb, gapo2 + gape2 * rs->tb);
}
} else {
Hs[0] = banded_striped_epi8_seqalign_mtx_getscore(ups, ubs, roffs, W, rs->tb - 1, rs->qb - 1);
}
t = rs->qb - roffs[rs->tb - 1];
t = Int64(rs->tb - 1) * bandwidth + (t % W) * WORDSIZE + (t / W);
bt = banded_striped_epi8_seqalign_piecex_backcal_cell(rs->qb - roffs[rs->tb - 1], qseq[rs->qb], tseq[rs->tb], Hs, ups[t], eps? eps[t] : gapo1 + gape1, qps? qps[t] : 0, W, matrix, piecewise, prior_match);
prior_match = 1;
if(bt == SEQALIGN_BT_M){
if(qseq[rs->qb] == tseq[rs->tb]){
rs->mat ++;
} else {
rs->mis ++;
}
rs->qb --;
rs->tb --;
rs->aln ++;
cg = _push_cigar_bsalign(cigars, cg, 0, 1);
Hs[1] = Hs[0];
} else if(bt == SEQALIGN_BT_I){
if(rs->qb <= 0){
cg = _push_cigar_bsalign(cigars, cg, 1, 1);
Hs[1] = Hs[0];
rs->qb --;
rs->ins ++;
rs->aln ++;
} else {
for(sz=1;Int(sz)+roffs[rs->tb]<=rs->qb;sz++){
if(piecewise == 2){
t = num_max(gapo1 + sz * gape1, gapo2 + sz * gape2);
} else {
t = gapo1 + sz * gape1;
}
Hs[0] = banded_striped_epi8_seqalign_mtx_getscore(ups, ubs, roffs, W, rs->tb, rs->qb - sz);
if(Hs[0] + t == Hs[1]){
cg = _push_cigar_bsalign(cigars, cg, 1, sz);
Hs[1] = Hs[0];
rs->qb -= sz;
rs->ins += sz;
rs->aln += sz;
break;
}
}
}
} else {
Hs[2] = (1 << 4) | bt;
rs->tb --;
continue;
}
}
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP){
if(cg && cigars){
push_u4v(cigars, cg);
}
} else {
if(rs->qb >= 0){
op = 1;
sz = rs->qb + 1;
rs->ins += sz;
rs->qb = -1;
} else if(rs->tb >= 0){
op = 2;
sz = rs->tb + 1;
rs->del += sz;
rs->tb = -1;
} else {
op = sz = 0;
}
rs->aln += sz;
cg = _push_cigar_bsalign(cigars, cg, op, sz);
if(cg && cigars){
push_u4v(cigars, cg);
}
}
rs->qb ++;
rs->tb ++;
if(cigars) sub_reverse_u4v(cigars, cgz, cigars->size);
return rs->aln;
}
static inline seqalign_result_t banded_striped_epi8_seqalign_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, u4i bandwidth, b1i matrix[16], b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, int verbose){
seqalign_result_t rs;
b1i *memp, *mempb, *qprof, *us[2], *es[2], *qs[2], *ups, *eps, *qps, *ubs;
u8i mpsize;
u4i i, W, rbeg, rmax, mov;
int piecewise, smax, smin, max_score, score, rh, rbx, rby, rbz, *begs, *ubegs[2];
u1i tbase;
if(bandwidth == 0) bandwidth = qlen;
bandwidth = roundup_times(bandwidth, WORDSIZE);
W = bandwidth / WORDSIZE;
piecewise = banded_striped_epi8_seqalign_get_piecewise(gapo1, gape1, gapo2, gape2, bandwidth);
if(verbose){
fprintf(stdout, "[%d,%d][%d,%d] PIECEWISE=%d\tW=%d\n", gapo1, gape1, gapo2, gape2, piecewise, W);
}
smax = - MAX_B1;;
smin = MAX_B1;
for(i=0;i<16;i++){
smax = num_max(smax, matrix[i]);
smin = num_min(smin, matrix[i]);
}
mpsize = WORDSIZE;
mpsize += banded_striped_epi8_seqalign_qprof_size(qlen, bandwidth); mpsize += ((u8i)bandwidth) * ((tlen + 1) * (piecewise + 1)); mpsize += (tlen + 2) * roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE); mpsize += bandwidth * ((piecewise + 1)); mpsize += roundup_times((tlen + 1) * sizeof(int), WORDSIZE); if(mempool){
if(mempool->aligned < WORDSIZE){
fflush(stdout); fprintf(stderr, " -- mempool should be aligned by (%d) but (%d) bytes in %s -- %s:%d --\n", WORDSIZE, mempool->aligned, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(mempool->size % WORDSIZE){
inc_b1v(mempool, roundup_times(mempool->size, WORDSIZE) - mempool->size);
}
encap_b1v(mempool, mpsize);
memp = mempool->buffer + mempool->size + WORDSIZE;
mempb = NULL;
} else {
mempb = malloc(mpsize);
memp = mempb + WORDSIZE;
}
qprof = memp; memp += banded_striped_epi8_seqalign_qprof_size(qlen, bandwidth);
ups = memp + bandwidth; memp += bandwidth * Int64(tlen + 1);
if(piecewise){
eps = memp + bandwidth; memp += bandwidth * Int64(tlen + 1);
} else eps = NULL;
if(piecewise == 2){
qps = memp + bandwidth; memp += bandwidth * Int64(tlen + 1);
} else qps = NULL;
ubs = memp + roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE); memp += (tlen + 1) * roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE);
ubegs[0] = (int*)memp; memp += 1 * roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE);
us[0] = memp; memp += bandwidth;
if(piecewise){
es[0] = memp; memp += bandwidth;
} else es[0] = NULL;
if(piecewise == 2){
qs[0] = memp; memp += bandwidth;
} else qs[0] = NULL;
begs = ((int*)memp) + 1; memp += roundup_times((tlen + 1) * sizeof(u4i), WORDSIZE);
if(mode & SEQALIGN_MODE_QPROF){
} else {
banded_striped_epi8_seqalign_set_query_prof(qseq, qlen, qprof, bandwidth, matrix);
}
memset(&rs, 0, sizeof(seqalign_result_t));
rs.score = SEQALIGN_SCORE_MIN;
us[1] = ups - bandwidth;
ubegs[1] = (int*)(ubs - roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE));
es[1] = eps? eps - bandwidth : NULL;
qs[1] = qps? qps - bandwidth : NULL;
banded_striped_epi8_seqalign_piecex_row_init(us[1], es[1], qs[1], ubegs[1], begs - 1, mode, bandwidth, smax, smin, gapo1, gape1, gapo2, gape2);
rbeg = rmax = 0;
mov = 0;
for(i=0;i<tlen;i++){
tbase = tseq[i];
if(mov && rbeg + bandwidth < qlen){
mov = num_min(mov, num_max(0, Int(qlen) - Int(rbeg + bandwidth)));
rbeg += mov;
rh = banded_striped_epi8_seqalign_getscore(us[1], ubegs[1], W, mov - 1);
} else {
mov = 0;
if(rbeg){
rh = SEQALIGN_SCORE_MIN;
} else {
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP || i == 0) rh = 0;
else if(piecewise < 2) rh = gapo1 + gape1 * i;
else rh = num_max(gapo1 + gape1 * i, gapo2 + gape2 * i);
}
}
banded_striped_epi8_seqalign_piecex_row_movx(us, es, qs, ubegs, W, mov, piecewise, smax, smin, gapo1, gape1, gapo2, gape2); #if 0#endif
us[1] = ups + ((u8i)bandwidth) * i;
es[1] = eps? eps + ((u8i)bandwidth) * i : NULL;
qs[1] = qps? qps + ((u8i)bandwidth) * i : NULL;
ubegs[1] = (int*)(ubs + roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE) * i);
__builtin_prefetch(us[1], 1);
banded_striped_epi8_seqalign_piecex_row_cal(rbeg, tbase, us, es, qs, ubegs, qprof, gapo1, gape1, gapo2, gape2, W, mov, rh, piecewise);
if(verbose){
rmax = banded_striped_epi8_seqalign_row_max(us[1], ubegs[1], W, &max_score);
banded_striped_epi8_seqalign_row_print(stdout, qseq, qlen, 1, i, tbase, bandwidth, mov, rbeg, rmax, max_score, ubegs[1], us[1], es[1], verbose);
}
if(verbose > 3){
b1i *tus[2], *tes[2], *tqs[2];
int *tubegs[2];
tus[0] = ups + ((u8i)bandwidth) * (Int(i) - 1);
tes[0] = eps? eps + ((u8i)bandwidth) * (Int(i) - 1) : NULL;
tqs[0] = qps? qps + ((u8i)bandwidth) * (Int(i) - 1) : NULL;
tubegs[0] = (int*)(ubs + roundup_times((WORDSIZE + 1) * sizeof(int), WORDSIZE) * (Int(i) - 1));
tus[1] = us[1];
tes[1] = es[1];
tqs[1] = qs[1];
tubegs[1] = ubegs[1];
banded_striped_epi8_seqalign_piecex_row_verify(i, rbeg - mov, mode, W, mov, tbase, tus, tes, tqs, tubegs, qprof, gapo1, gape1, gapo2, gape2);
}
rbx = banded_striped_epi8_seqalign_band_mov(us[1], ubegs[1], W, i, rbeg, qlen);
if(seqalign_mode_type(mode) == SEQALIGN_MODE_GLOBAL){
rbz = 2 * num_max(Int(tlen / qlen), 1); rby = Int((1.0 * i / tlen) * qlen); if(rbeg + rbz * Int64(tlen - i - 1) + bandwidth <= (qlen + rbz - 1)){ mov = 1 + ((qlen - (rbeg + bandwidth)) / num_max(1, tlen - i - 1));
} else if(Int(rbeg) < rby - Int(bandwidth)){
mov = rbx + 1;
} else if(Int(rbeg) > rby){
mov = num_max(0, rbx - 1);
} else {
mov = rbx;
}
} else {
mov = rbx;
}
begs[i] = rbeg;
if(seqalign_mode_type(mode) != SEQALIGN_MODE_GLOBAL){
if(rbeg + bandwidth >= qlen){
score = banded_striped_epi8_seqalign_getscore(us[1], ubegs[1], W, qlen - 1 - rbeg);
if(score > rs.score){
rs.score = score;
rs.qe = qlen - 1;
rs.te = i;
}
}
}
}
if(seqalign_mode_type(mode) == SEQALIGN_MODE_GLOBAL){
rs.score = banded_striped_epi8_seqalign_getscore(us[1], ubegs[1], W, qlen - 1 - rbeg);
rs.qe = qlen - 1;
rs.te = tlen - 1;
} else {
rmax = banded_striped_epi8_seqalign_row_max(us[1], ubegs[1], W, &max_score);
if(max_score > rs.score){
rs.score = max_score;
rs.qe = rbeg + rmax;
rs.te = tlen - 1;
}
}
banded_striped_epi8_seqalign_piecex_backcal(qseq, tseq, ups, eps, qps, ubs, begs, mode, bandwidth, matrix, gapo1, gape1, gapo2, gape2, &rs, cigars);
if(mempb) free(mempb);
return rs;
}
#endif