#define HTS_BUILDING_LIBRARY
#include <config.h>
#include "htslib/sam.h"
#include "textutils_internal.h"
#define MAX_BASE_MOD 256
struct hts_base_mod_state {
int type[MAX_BASE_MOD]; int canonical[MAX_BASE_MOD]; char strand[MAX_BASE_MOD]; int MMcount[MAX_BASE_MOD]; char *MM[MAX_BASE_MOD]; char *MMend[MAX_BASE_MOD]; uint8_t *ML[MAX_BASE_MOD]; int MLstride[MAX_BASE_MOD]; int implicit[MAX_BASE_MOD]; int seq_pos; int nmods; uint32_t flags; };
hts_base_mod_state *hts_base_mod_state_alloc(void) {
return calloc(1, sizeof(hts_base_mod_state));
}
void hts_base_mod_state_free(hts_base_mod_state *state) {
free(state);
}
static void seq_freq(const bam1_t *b, int freq[16]) {
int i;
memset(freq, 0, 16*sizeof(*freq));
uint8_t *seq = bam_get_seq(b);
for (i = 0; i < b->core.l_qseq; i++)
freq[bam_seqi(seq, i)]++;
freq[15] = b->core.l_qseq; }
static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
uint32_t flags) {
state->seq_pos = 0;
state->nmods = 0;
state->flags = flags;
uint8_t *mm = bam_aux_get(b, "MM");
if (!mm) mm = bam_aux_get(b, "Mm");
if (!mm)
return 0;
if (mm[0] != 'Z') {
hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b));
return -1;
}
uint8_t *mi = bam_aux_get(b, "MN");
if (mi && bam_aux2i(mi) != b->core.l_qseq) {
hts_log_error("%s: MM/MN data length is incompatible with"
" SEQ length", bam_get_qname(b));
return -1;
}
uint8_t *ml = bam_aux_get(b, "ML");
if (!ml) ml = bam_aux_get(b, "Ml");
if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b));
return -1;
}
uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
if (ml) ml += 6;
int freq[16];
if (b->core.flag & BAM_FREVERSE)
seq_freq(b, freq);
char *cp = (char *)mm+1;
int mod_num = 0;
int implicit = 1;
while (*cp) {
for (; *cp; cp++) {
unsigned char btype = *cp++;
if (btype != 'A' && btype != 'C' &&
btype != 'G' && btype != 'T' &&
btype != 'U' && btype != 'N')
return -1;
if (btype == 'U') btype = 'T';
btype = seq_nt16_table[btype];
if (*cp != '+' && *cp != '-')
return -1; char strand = *cp++;
char *ms = cp, *me; char *cp_end = NULL;
int chebi = 0;
if (isdigit_c(*cp)) {
chebi = strtol(cp, &cp_end, 10);
cp = cp_end;
ms = cp-1;
} else {
while (*cp && isalpha_c(*cp))
cp++;
if (*cp == '\0')
return -1;
}
me = cp;
implicit = 1;
if (*cp == '.') {
cp++;
} else if (*cp == '?') {
implicit = 0;
cp++;
} else if (*cp != ',' && *cp != ';') {
return -1;
}
long delta;
int n = 0; int stride = me-ms;
int ndelta = 0;
if (b->core.flag & BAM_FREVERSE) {
int total_seq = 0;
for (;;) {
cp += (*cp == ',');
if (*cp == 0 || *cp == ';')
break;
delta = strtol(cp, &cp_end, 10);
if (cp_end == cp) {
hts_log_error("%s: Hit end of MM tag. Missing "
"semicolon?", bam_get_qname(b));
return -1;
}
cp = cp_end;
total_seq += delta+1;
ndelta++;
}
delta = freq[seqi_rc[btype]] - total_seq; } else {
delta = *cp == ','
? strtol(cp+1, &cp_end, 10)
: 0;
if (!cp_end) {
delta = INT_MAX;
cp_end = cp+1;
}
}
while (ms < me) {
state->type [mod_num] = chebi ? -chebi : *ms;
state->strand [mod_num] = (strand == '-');
state->canonical[mod_num] = btype;
state->MLstride [mod_num] = stride;
state->implicit [mod_num] = implicit;
if (delta < 0) {
hts_log_error("%s: MM tag refers to bases beyond sequence "
"length", bam_get_qname(b));
return -1;
}
state->MMcount [mod_num] = delta;
if (b->core.flag & BAM_FREVERSE) {
state->MM [mod_num] = cp+1;
state->MMend[mod_num] = cp_end;
state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
} else {
state->MM [mod_num] = cp_end;
state->MMend[mod_num] = NULL;
state->ML [mod_num] = ml ? ml+n : NULL;
}
if (++mod_num >= MAX_BASE_MOD) {
hts_log_error("%s: Too many base modification types",
bam_get_qname(b));
return -1;
}
ms++; n++;
}
if (ml) {
if (b->core.flag & BAM_FREVERSE) {
ml += ndelta*stride;
} else {
while (*cp && *cp != ';') {
if (*cp == ',')
ml+=stride;
cp++;
}
}
if (ml > ml_end) {
hts_log_error("%s: Insufficient number of entries in ML "
"tag", bam_get_qname(b));
return -1;
}
} else {
if (cp_end && (b->core.flag & BAM_FREVERSE))
cp = cp_end;
else
while (*cp && *cp != ';')
cp++;
}
if (!*cp) {
hts_log_error("%s: Hit end of MM tag. Missing semicolon?",
bam_get_qname(b));
return -1;
}
}
}
state->nmods = mod_num;
return 0;
}
int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
return bam_parse_basemod2(b, state, 0);
}
int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods) {
if (b->core.flag & BAM_FREVERSE) {
if (state->seq_pos < 0)
return -1;
} else {
if (state->seq_pos >= b->core.l_qseq)
return -1;
}
int i, j, n = 0;
unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos);
state->seq_pos++;
if (b->core.flag & BAM_FREVERSE)
base = seqi_rc[base];
for (i = 0; i < state->nmods; i++) {
int unchecked = 0;
if (state->canonical[i] != base && state->canonical[i] != 15)
continue;
if (state->MMcount[i]-- > 0) {
if (!state->implicit[i] &&
(state->flags & HTS_MOD_REPORT_UNCHECKED))
unchecked = 1;
else
continue;
}
char *MMptr = state->MM[i];
if (n < n_mods) {
mods[n].modified_base = state->type[i];
mods[n].canonical_base = seq_nt16_str[state->canonical[i]];
mods[n].strand = state->strand[i];
mods[n].qual = unchecked
? HTS_MOD_UNCHECKED
: (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN);
}
n++;
if (unchecked)
continue;
if (state->ML[i])
state->ML[i] += (b->core.flag & BAM_FREVERSE)
? -state->MLstride[i]
: +state->MLstride[i];
if (b->core.flag & BAM_FREVERSE) {
char *cp;
for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
if (*cp == ',')
break;
state->MMend[i] = cp;
if (cp != state->MM[i])
state->MMcount[i] = strtol(cp+1, NULL, 10);
else
state->MMcount[i] = INT_MAX;
} else {
if (*state->MM[i] == ',')
state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
else
state->MMcount[i] = INT_MAX;
}
for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
if (n < n_mods) {
mods[n].modified_base = state->type[j];
mods[n].canonical_base = seq_nt16_str[state->canonical[j]];
mods[n].strand = state->strand[j];
mods[n].qual = state->ML[j] ? *state->ML[j] : -1;
}
n++;
state->MMcount[j] = state->MMcount[i];
state->MM[j] = state->MM[i];
if (state->ML[j])
state->ML[j] += (b->core.flag & BAM_FREVERSE)
? -state->MLstride[j]
: +state->MLstride[j];
}
i = j-1;
}
return n;
}
int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods, int *pos) {
if (state->seq_pos >= b->core.l_qseq)
return 0;
int next[16], freq[16] = {0}, i;
memset(next, 0x7f, 16*sizeof(*next));
const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED;
if (b->core.flag & BAM_FREVERSE) {
for (i = 0; i < state->nmods; i++) {
if (unchecked && !state->implicit[i])
next[seqi_rc[state->canonical[i]]] = 1;
else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i])
next[seqi_rc[state->canonical[i]]] = state->MMcount[i];
}
} else {
for (i = 0; i < state->nmods; i++) {
if (unchecked && !state->implicit[i])
next[state->canonical[i]] = 0;
else if (next[state->canonical[i]] > state->MMcount[i])
next[state->canonical[i]] = state->MMcount[i];
}
}
for (i = state->seq_pos; i < b->core.l_qseq; i++) {
unsigned char bc = bam_seqi(bam_get_seq(b), i);
if (next[bc] <= freq[bc] || next[15] <= freq[15])
break;
freq[bc]++;
if (bc != 15) freq[15]++;
}
*pos = state->seq_pos = i;
if (i >= b->core.l_qseq) {
for (i = 0; i < state->nmods; i++) {
if (!(b->core.flag & BAM_FREVERSE) &&
state->MMcount[i] < 0x7f000000) {
hts_log_warning("MM tag refers to bases beyond sequence length");
return -1;
}
}
return 0;
}
if (b->core.flag & BAM_FREVERSE) {
for (i = 0; i < state->nmods; i++)
state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
} else {
for (i = 0; i < state->nmods; i++)
state->MMcount[i] -= freq[state->canonical[i]];
}
int r = bam_mods_at_next_pos(b, state, mods, n_mods);
return r > 0 ? r : 0;
}
int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods) {
int r = 0;
while (state->seq_pos <= qpos)
if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0)
break;
return r;
}
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
*ntype = state->nmods;
return state->type;
}
int bam_mods_query_type(hts_base_mod_state *state, int code,
int *strand, int *implicit, char *canonical) {
int i;
for (i = 0; i < state->nmods; i++) {
if (state->type[i] == code)
break;
}
if (i == state->nmods)
return -1;
if (strand) *strand = state->strand[i];
if (implicit) *implicit = state->implicit[i];
if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
return 0;
}
int bam_mods_queryi(hts_base_mod_state *state, int i,
int *strand, int *implicit, char *canonical) {
if (i < 0 || i >= state->nmods)
return -1;
if (strand) *strand = state->strand[i];
if (implicit) *implicit = state->implicit[i];
if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
return 0;
}