#define HTS_BUILDING_LIBRARY
#include <config.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <zlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <math.h>
#include <inttypes.h>
#include <ctype.h>
#include "cram.h"
#include "os.h"
#include "../sam_internal.h"
#include "../htslib/hts.h"
#include "../htslib/hts_endian.h"
#include "../textutils_internal.h"
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
#define Z_CRAM_STRAT Z_FILTERED
static int process_one_read(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *cr,
bam_seq_t *b, int rnum, kstring_t *MD,
int embed_ref, int no_ref);
static int sub_idx(char *key, char val) {
int i;
for (i = 0; i < 4 && *key++ != val; i++);
return i;
}
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
cram_block_compression_hdr *h,
int embed_ref) {
cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
int i, mc, r = 0;
int no_ref = c->no_ref;
if (!cb || !map)
return NULL;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
r |= itf8_put_blk(cb, h->ref_seq_id);
r |= itf8_put_blk(cb, h->ref_seq_start);
r |= itf8_put_blk(cb, h->ref_seq_span);
r |= itf8_put_blk(cb, h->num_records);
r |= itf8_put_blk(cb, h->num_landmarks);
for (i = 0; i < h->num_landmarks; i++) {
r |= itf8_put_blk(cb, h->landmark[i]);
}
}
if (h->preservation_map) {
kh_destroy(map, h->preservation_map);
h->preservation_map = NULL;
}
if (c->num_records > 0) {
khint_t k;
int r;
if (!(h->preservation_map = kh_init(map)))
return NULL;
k = kh_put(map, h->preservation_map, "RN", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
k = kh_put(map, h->preservation_map, "PI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "UI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
k = kh_put(map, h->preservation_map, "MI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
} else {
k = kh_put(map, h->preservation_map, "SM", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "TD", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "AP", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = h->AP_delta;
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
k = kh_put(map, h->preservation_map, "QO", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = h->qs_seq_orient;
}
if (no_ref || embed_ref>0) {
k = kh_put(map, h->preservation_map, "RR", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
}
}
}
mc = 0;
BLOCK_SIZE(map) = 0;
if (h->preservation_map) {
khint_t k;
for (k = kh_begin(h->preservation_map);
k != kh_end(h->preservation_map);
k++) {
const char *key;
khash_t(map) *pmap = h->preservation_map;
if (!kh_exist(pmap, k))
continue;
key = kh_key(pmap, k);
BLOCK_APPEND(map, key, 2);
switch(CRAM_KEY(key[0], key[1])) {
case CRAM_KEY('M','I'):
case CRAM_KEY('U','I'):
case CRAM_KEY('P','I'):
case CRAM_KEY('A','P'):
case CRAM_KEY('R','N'):
case CRAM_KEY('R','R'):
case CRAM_KEY('Q','O'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('S','M'): {
char smat[5], *mp = smat;
*mp++ =
(sub_idx(h->substitution_matrix[0], 'C') << 6) |
(sub_idx(h->substitution_matrix[0], 'G') << 4) |
(sub_idx(h->substitution_matrix[0], 'T') << 2) |
(sub_idx(h->substitution_matrix[0], 'N') << 0);
*mp++ =
(sub_idx(h->substitution_matrix[1], 'A') << 6) |
(sub_idx(h->substitution_matrix[1], 'G') << 4) |
(sub_idx(h->substitution_matrix[1], 'T') << 2) |
(sub_idx(h->substitution_matrix[1], 'N') << 0);
*mp++ =
(sub_idx(h->substitution_matrix[2], 'A') << 6) |
(sub_idx(h->substitution_matrix[2], 'C') << 4) |
(sub_idx(h->substitution_matrix[2], 'T') << 2) |
(sub_idx(h->substitution_matrix[2], 'N') << 0);
*mp++ =
(sub_idx(h->substitution_matrix[3], 'A') << 6) |
(sub_idx(h->substitution_matrix[3], 'C') << 4) |
(sub_idx(h->substitution_matrix[3], 'G') << 2) |
(sub_idx(h->substitution_matrix[3], 'N') << 0);
*mp++ =
(sub_idx(h->substitution_matrix[4], 'A') << 6) |
(sub_idx(h->substitution_matrix[4], 'C') << 4) |
(sub_idx(h->substitution_matrix[4], 'G') << 2) |
(sub_idx(h->substitution_matrix[4], 'T') << 0);
BLOCK_APPEND(map, smat, 5);
break;
}
case CRAM_KEY('T','D'): {
r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
BLOCK_APPEND(map,
BLOCK_DATA(h->TD_blk),
BLOCK_SIZE(h->TD_blk));
break;
}
default:
hts_log_warning("Unknown preservation key '%.2s'", key);
break;
}
mc++;
}
}
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
mc = 0;
BLOCK_SIZE(map) = 0;
if (h->codecs[DS_BF]) {
if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_CF]) {
if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RL]) {
if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_AP]) {
if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RG]) {
if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_MF]) {
if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NS]) {
if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NP]) {
if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TS]) {
if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NF]) {
if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TC]) {
if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TN]) {
if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TL]) {
if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FN]) {
if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FC]) {
if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FP]) {
if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BS]) {
if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_IN]) {
if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_DL]) {
if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BA]) {
if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BB]) {
if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_MQ]) {
if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RN]) {
if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_QS]) {
if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_QQ]) {
if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RI]) {
if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
fd->version))
return NULL;
mc++;
}
if (CRAM_MAJOR_VERS(fd->version) != 1) {
if (h->codecs[DS_SC]) {
if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RS]) {
if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_PD]) {
if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_HC]) {
if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
fd->version))
return NULL;
mc++;
}
}
if (h->codecs[DS_TM]) {
if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TV]) {
if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
fd->version))
return NULL;
mc++;
}
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
mc = 0;
BLOCK_SIZE(map) = 0;
if (c->tags_used) {
khint_t k;
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
int key;
if (!kh_exist(c->tags_used, k))
continue;
key = kh_key(c->tags_used, k);
cram_codec *cd = kh_val(c->tags_used, k)->codec;
r |= (fd->vv.varint_put32_blk(map, key) <= 0);
if (-1 == cd->store(cd, map, NULL, fd->version))
return NULL;
mc++;
}
}
r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
BLOCK_UPLEN(cb);
cram_free_block(map);
if (r >= 0)
return cb;
block_err:
return NULL;
}
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
char *buf;
char *cp;
cram_block *b = cram_new_block(MAPPED_SLICE, 0);
int j;
if (!b)
return NULL;
cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
if (NULL == buf) {
cram_free_block(b);
return NULL;
}
cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
} else {
if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
hts_log_error("Reference position too large for CRAM 3");
cram_free_block(b);
free(buf);
return NULL;
}
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
}
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
if (CRAM_MAJOR_VERS(fd->version) == 2)
cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
else if (CRAM_MAJOR_VERS(fd->version) >= 3)
cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
for (j = 0; j < s->hdr->num_content_ids; j++) {
cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
}
if (s->hdr->content_type == MAPPED_SLICE)
cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
if (CRAM_MAJOR_VERS(fd->version) != 1) {
memcpy(cp, s->hdr->md5, 16); cp += 16;
}
assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
b->data = (unsigned char *)buf;
b->comp_size = b->uncomp_size = cp-buf;
return b;
}
static int cram_encode_slice_read(cram_fd *fd,
cram_container *c,
cram_block_compression_hdr *h,
cram_slice *s,
cram_record *cr,
int64_t *last_pos) {
int r = 0;
int32_t i32;
int64_t i64;
unsigned char uc;
i32 = fd->cram_flag_swap[cr->flags & 0xfff];
r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
i32 = cr->cram_flags & CRAM_FLAG_MASK;
r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
if (c->pos_sorted) {
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
i64 = cr->apos - *last_pos;
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
} else {
i32 = cr->apos - *last_pos;
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
}
*last_pos = cr->apos;
} else {
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
i64 = cr->apos;
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
} else {
i32 = cr->apos;
r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
}
}
r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
if (cr->cram_flags & CRAM_FLAG_DETACHED) {
i32 = cr->mate_flags;
r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
(char *)&cr->mate_ref_id, 1);
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
(char *)&cr->mate_pos, 1);
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
(char *)&cr->tlen, 1);
} else {
i32 = cr->mate_pos;
r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
(char *)&i32, 1);
i32 = cr->tlen;
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
(char *)&i32, 1);
}
} else {
if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
(char *)&cr->mate_line, 1);
}
if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
(char *)&cr->tlen, 1);
}
}
}
if (CRAM_MAJOR_VERS(fd->version) == 1) {
int j;
uc = cr->ntags;
r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
for (j = 0; j < cr->ntags; j++) {
uint32_t i32 = s->TN[cr->TN_idx + j]; r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
}
} else {
r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
}
if (!(cr->flags & BAM_FUNMAP)) {
int prev_pos = 0, j;
r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
(char *)&cr->nfeature, 1);
for (j = 0; j < cr->nfeature; j++) {
cram_feature *f = &s->features[cr->feature + j];
uc = f->X.code;
r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
i32 = f->X.pos - prev_pos;
r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
prev_pos = f->X.pos;
switch(f->X.code) {
case 'X':
uc = f->X.base;
r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
(char *)&uc, 1);
break;
case 'S':
break;
case 'I':
break;
case 'i':
uc = f->i.base;
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
(char *)&uc, 1);
break;
case 'D':
i32 = f->D.len;
r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
(char *)&i32, 1);
break;
case 'B':
uc = f->B.base;
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
(char *)&uc, 1);
break;
case 'b':
r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
(char *)BLOCK_DATA(s->seqs_blk)
+ f->b.seq_idx,
f->b.len);
break;
case 'Q':
break;
case 'N':
i32 = f->N.len;
r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
(char *)&i32, 1);
break;
case 'P':
i32 = f->P.len;
r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
(char *)&i32, 1);
break;
case 'H':
i32 = f->H.len;
r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
(char *)&i32, 1);
break;
default:
hts_log_error("Unhandled feature code %c", f->X.code);
return -1;
}
}
r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
(char *)&cr->mqual, 1);
} else {
char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
if (cr->len)
r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
}
return r ? -1 : 0;
}
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
int level = fd->level, i;
int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
int v31_or_above = (fd->version >= (3<<8)+1);
if (level > 5 && s->block[0]->uncomp_size > 500)
cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
if (fd->use_bz2)
method |= 1<<BZIP2;
int method_rans = (1<<RANS0) | (1<<RANS1);
int method_ranspr = method_rans;
if (fd->use_rans) {
method_ranspr = (1<<RANS_PR0) | (1<<RANS_PR1);
if (level > 1)
method_ranspr |=
(1<<RANS_PR64) | (1<<RANS_PR9)
| (1<<RANS_PR128) | (1<<RANS_PR193);
if (level > 5)
method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
}
if (fd->use_rans) {
methodF |= v31_or_above ? method_ranspr : method_rans;
method |= v31_or_above ? method_ranspr : method_rans;
}
int method_arith = 0;
if (fd->use_arith) {
method_arith = (1<<ARITH_PR0) | (1<<ARITH_PR1);
if (level > 1)
method_arith |=
(1<<ARITH_PR64) | (1<<ARITH_PR9)
| (1<<ARITH_PR128) | (1<<ARITH_PR129)
| (1<<ARITH_PR192) | (1u<<ARITH_PR193);
}
if (fd->use_arith && v31_or_above) {
methodF |= method_arith;
method |= method_arith;
}
if (fd->use_lzma)
method |= (1<<LZMA);
methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
if (level >= 5) {
method |= 1<<GZIP_1;
methodF = method;
}
if (level == 1) {
method &= ~(1<<GZIP);
method |= 1<<GZIP_1;
methodF = method;
}
int qmethod = method;
int qmethodF = method;
if (v31_or_above && fd->use_fqz) {
qmethod |= 1<<FQZ;
qmethodF |= 1<<FQZ;
if (fd->level > 4) {
qmethod |= 1<<FQZ_b;
qmethodF |= 1<<FQZ_b;
}
if (fd->level > 6) {
qmethod |= (1<<FQZ_c) | (1<<FQZ_d);
qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
}
}
pthread_mutex_lock(&fd->metrics_lock);
for (i = 0; i < DS_END; i++)
if (c->stats[i] && c->stats[i]->nvals > 16)
fd->m[i]->unpackable = 1;
pthread_mutex_unlock(&fd->metrics_lock);
if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], method, level))
return -1;
if (fd->level == 0) {
} else if (fd->level == 1) {
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
qmethodF, 1))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
method, 1))
return -1;
}
} else if (fd->level < 3) {
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
qmethod, 1))
return -1;
if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
method, 1))
return -1;
if (s->block[DS_BB])
if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
method, 1))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
method, level))
return -1;
}
} else {
if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
qmethod, level))
return -1;
if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
method, level))
return -1;
if (s->block[DS_BB])
if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
method, level))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
method, level))
return -1;
}
}
int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
if (fd->version >= (3<<8)+1 && fd->use_tok)
method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
method_rn, level))
return -1;
if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
method, level))
return -1;
{
int i;
for (i = DS_END ; i < s->hdr->num_blocks; i++) {
if (!s->block[i] || s->block[i] == s->block[0])
continue;
if (s->block[i]->method != RAW)
continue;
if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
method, level))
return -1;
}
}
{
int i;
for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
if (!s->block[i] || s->block[i] == s->block[0])
continue;
if (s->block[i]->method != RAW)
continue;
if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
methodF, level))
return -1;
}
}
return 0;
}
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
if (!codec)
return 0;
switch(codec->codec) {
case E_GOLOMB:
case E_HUFFMAN:
case E_BETA:
case E_SUBEXP:
case E_GOLOMB_RICE:
case E_GAMMA:
codec->out = s->block[0];
break;
case E_CONST_BYTE:
case E_CONST_INT:
codec->out = NULL;
break;
case E_EXTERNAL:
case E_VARINT_UNSIGNED:
case E_VARINT_SIGNED:
if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
return -1;
codec->u.external.content_id = ds_id;
codec->out = s->block[ds_id];
break;
case E_BYTE_ARRAY_STOP: if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
return -1;
codec->u.byte_array_stop.content_id = ds_id;
codec->out = s->block[ds_id];
break;
case E_BYTE_ARRAY_LEN: {
cram_codec *bal = codec->u.e_byte_array_len.len_codec;
if (cram_allocate_block(bal, s, bal->u.external.content_id))
return -1;
bal = codec->u.e_byte_array_len.val_codec;
if (cram_allocate_block(bal, s, bal->u.external.content_id))
return -1;
break;
}
case E_XRLE:
if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
return -1;
if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
return -1;
break;
case E_XPACK:
if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
return -1;
codec->out = cram_new_block(0, 0); if (!codec->out)
return -1;
break;
case E_XDELTA:
if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
return -1;
codec->out = cram_new_block(0, 0); if (!codec->out)
return -1;
break;
default:
break;
}
return 0;
}
static int cram_encode_slice(cram_fd *fd, cram_container *c,
cram_block_compression_hdr *h, cram_slice *s,
int embed_ref) {
int rec, r = 0;
int64_t last_pos;
enum cram_DS_ID id;
s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
? DS_ref
: (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
s->hdr->record_counter = c->num_records + c->record_counter;
c->num_records += s->hdr->num_records;
int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
if (!s->block || !s->hdr->block_content_ids)
return -1;
if (!(s->block[0] = cram_new_block(CORE, 0)))
return -1;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
h->codecs[DS_TN]->u.external.content_id = DS_TN;
} else {
s->block[DS_TN] = s->block[0];
}
}
if (embed_ref>0) {
if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
return -1;
s->ref_id = DS_ref; BLOCK_APPEND(s->block[DS_ref],
c->ref + s->hdr->ref_seq_start - c->ref_start,
s->hdr->ref_seq_span);
}
for (id = DS_QS; id < DS_TN; id++) {
if (cram_allocate_block(h->codecs[id], s, id) < 0)
return -1;
}
if (c->tags_used) {
int n;
s->hdr->num_blocks = DS_END;
for (n = 0; n < s->naux_block; n++) {
s->block[s->hdr->num_blocks++] = s->aux_block[n];
s->aux_block[n] = NULL;
}
}
last_pos = s->hdr->ref_seq_start;
for (rec = 0; rec < s->hdr->num_records; rec++) {
cram_record *cr = &s->crecs[rec];
if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
return -1;
}
s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
s->block[0]->comp_size = s->block[0]->uncomp_size;
if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
for (id = DS_QS; id < DS_TN; id++) {
if (h->codecs[id] && h->codecs[id]->flush)
h->codecs[id]->flush(h->codecs[id]);
}
for (id = 1; id < s->hdr->num_blocks; id++) {
if (!s->block[id] || s->block[id] == s->block[0])
continue;
if (s->block[id]->uncomp_size == 0)
BLOCK_UPLEN(s->block[id]);
}
if (cram_compress_slice(fd, c, s) == -1)
return -1;
{
int i, j;
s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
s->hdr->num_blocks * sizeof(int32_t));
if (!s->hdr->block_content_ids)
return -1;
for (i = j = 1; i < s->hdr->num_blocks; i++) {
if (!s->block[i] || s->block[i] == s->block[0])
continue;
if (s->block[i]->uncomp_size == 0) {
cram_free_block(s->block[i]);
s->block[i] = NULL;
continue;
}
s->block[j] = s->block[i];
s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
j++;
}
s->hdr->num_content_ids = j-1;
s->hdr->num_blocks = j;
if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
return -1;
}
return r ? -1 : 0;
block_err:
return -1;
}
static inline const char *bam_data_end(bam1_t *b) {
return (const char *)b->data + b->l_data;
}
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
int type = *aux++;
switch (type) {
case 'c':
if (aux_end - aux < 1) {
errno = EINVAL;
return 0;
}
return *(int8_t *)aux;
case 'C':
if (aux_end - aux < 1) {
errno = EINVAL;
return 0;
}
return *aux;
case 's':
if (aux_end - aux < 2) {
errno = EINVAL;
return 0;
}
return le_to_i16(aux);
case 'S':
if (aux_end - aux < 2) {
errno = EINVAL;
return 0;
}
return le_to_u16(aux);
case 'i':
if (aux_end - aux < 4) {
errno = EINVAL;
return 0;
}
return le_to_i32(aux);
case 'I':
if (aux_end - aux < 4) {
errno = EINVAL;
return 0;
}
return le_to_u32(aux);
default:
errno = EINVAL;
}
return 0;
}
static int expected_template_count(bam_seq_t *b) {
int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
if (TC) {
int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
if (expected < n)
expected = n;
}
if (!TC && bam_aux_get(b, "SA")) {
expected = INT_MAX;
}
return expected;
}
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
int bam_start) {
int r1, r2, ret = -1;
for (r2 = 0; r2 < s->hdr->num_records; r2++)
s->crecs[r2].cram_flags = 0;
if (!fd->lossy_read_names)
return 0;
khash_t(m_s2u64) *names = kh_init(m_s2u64);
if (!names)
goto fail;
for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
bam_seq_t *b = c->bams[r1];
khint_t k;
int n;
uint64_t e;
union {
uint64_t i64;
struct {
int32_t e,c; } counts;
} u;
e = expected_template_count(b);
u.counts.e = e; u.counts.c = 1;
k = kh_put(m_s2u64, names, bam_name(b), &n);
if (n == -1)
goto fail;
if (n == 0) {
u.i64 = kh_val(names, k);
if (u.counts.e != e) {
kh_val(names, k) = 0;
} else {
u.counts.c++;
if (u.counts.e == u.counts.c) {
kh_val(names, k) = -1;
} else {
kh_val(names, k) = u.i64;
}
}
} else {
kh_val(names, k) = u.i64;
}
}
for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
cram_record *cr = &s->crecs[r2];
bam_seq_t *b = c->bams[r1];
khint_t k;
k = kh_get(m_s2u64, names, bam_name(b));
if (k == kh_end(names))
goto fail;
if (kh_val(names, k) == -1)
cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
}
ret = 0;
fail:
if (names)
kh_destroy(m_s2u64, names);
return ret;
}
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
int bam_start) {
int r1, r2;
int keep_names = !fd->lossy_read_names;
for (r1 = bam_start, r2 = 0;
r1 < c->curr_c_rec && r2 < s->hdr->num_records;
r1++, r2++) {
cram_record *cr = &s->crecs[r2];
bam_seq_t *b = c->bams[r1];
cr->name = BLOCK_SIZE(s->name_blk);
if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
if (CRAM_MAJOR_VERS(fd->version) >= 4
&& (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
&& cr->mate_line) {
BLOCK_APPEND(s->name_blk, "\0", 1);
cr->name_len = 1;
} else {
BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
cr->name_len = bam_name_len(b);
}
} else {
cr->name_len = 0;
}
if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
goto block_err;
}
return 0;
block_err:
return -1;
}
#define CRAM_ge31(v) ((v) >= 0x301)
static inline
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
for(;;) {
while (*cig_len == 0) {
if (*cig_ind < ncigar) {
*cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK;
*cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
(*cig_ind)++;
} else {
return -1;
}
}
if (skip[*cig_op]) {
*spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
*cig_len = 0;
continue;
}
(*cig_len)--;
break;
}
return *cig_op;
}
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
hts_pos_t ref_start, hts_pos_t *ref_end) {
if (pos < ref_start)
return -1;
if (pos < *ref_end)
return 0;
if (pos - ref_start > UINT_MAX)
return -2;
hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
return -2;
char *tmp = realloc(*ref, new_end-ref_start+1);
if (!tmp)
return -1;
*ref = tmp;
uint32_t (*tmp5)[5] = realloc(**hist,
(new_end - ref_start)*sizeof(**hist));
if (!tmp5)
return -1;
*hist = tmp5;
*ref_end = new_end;
old_end -= ref_start;
new_end -= ref_start;
memset(&(*ref)[old_end], 0, new_end-old_end);
memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
return 0;
}
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end,
const uint8_t *MD) {
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
uint32_t ncigar = b->core.n_cigar;
uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
int iseq = 0, next_op;
hts_pos_t iref = b->core.pos - ref_start;
static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
while (iseq < b->core.l_qseq && *MD) {
if (isdigit(*MD)) {
int overflow = 0;
int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
if (overflow ||
extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
return -1;
while (iseq < b->core.l_qseq && len) {
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
&cig_len)) < 0)
return -1;
if (next_op != BAM_CMATCH &&
next_op != BAM_CEQUAL) {
hts_log_info("MD:Z and CIGAR are incompatible for "
"record %s", bam_get_qname(b));
return -1;
}
cig_len++;
do {
cig_len--;
(*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
iseq++;
len--;
} while (cig_len && iseq < b->core.l_qseq && len);
}
if (len > 0)
return -1; } else if (*MD == '^') {
MD++;
while (isalpha(*MD)) {
if (extend_ref(ref, hist, iref+ref_start, ref_start,
ref_end) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
&cig_len)) < 0)
return -1;
if (next_op != BAM_CDEL) {
hts_log_info("MD:Z and CIGAR are incompatible");
return -1;
}
(*ref)[iref++] = *MD++ & ~0x20;
}
} else {
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
&cig_len)) < 0)
return -1;
if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
hts_log_info("MD:Z and CIGAR are incompatible");
return -1;
}
(*ref)[iref++] = *MD++ & ~0x20;
iseq++;
}
}
return 1;
}
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end) {
const uint8_t *MD = bam_aux_get(b, "MD");
int ret = 0;
if (MD && *MD == 'Z') {
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
if (ret > 0)
return ret;
}
uint32_t *cigar = bam_get_cigar(b);
uint32_t ncigar = b->core.n_cigar;
uint32_t i, j;
hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
uint8_t *seq = bam_get_seq(b);
for (i = 0; i < ncigar; i++) {
switch (bam_cigar_op(cigar[i])) {
case BAM_CSOFT_CLIP:
case BAM_CINS:
iseq += bam_cigar_oplen(cigar[i]);
break;
case BAM_CMATCH:
case BAM_CEQUAL:
case BAM_CDIFF: {
int len = bam_cigar_oplen(cigar[i]);
static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
if (extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
return -1;
if (iseq + len <= b->core.l_qseq) {
if (ret < 0)
memset(&(*ref)[iref], 0, len);
for (j = 0; j < len; j++, iref++, iseq++)
(*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
} else {
iseq += len;
iref += len;
}
break;
}
case BAM_CDEL:
case BAM_CREF_SKIP:
iref += bam_cigar_oplen(cigar[i]);
}
}
return 1;
}
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
char *ref = NULL;
uint32_t (*hist)[5] = NULL;
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
if (ref_start < 0)
return -1;
if (extend_ref(&ref, &hist,
c->bams[r1 + s->hdr->num_records-1]->core.pos +
c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
ref_start, &ref_end) < 0)
return -1;
int r2;
hts_pos_t last_pos = -1;
for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
if (c->bams[r1]->core.pos < last_pos) {
hts_log_error("Cannot build reference with unsorted data");
goto err;
}
last_pos = c->bams[r1]->core.pos;
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
goto err;
}
hts_pos_t i;
for (i = 0; i < ref_end-ref_start; i++) {
if (!ref[i]) {
int max_v = 0, max_j = 4, j;
for (j = 0; j < 4; j++)
if (max_v < hist[i][j])
max_v = hist[i][j], max_j = j;
ref[i] = "ACGTN"[max_j];
}
}
free(hist);
c->ref = ref;
c->ref_start = ref_start+1;
c->ref_end = ref_end+1;
c->ref_free = 1;
return 0;
err:
free(ref);
free(hist);
return -1;
}
static int validate_md5(cram_fd *fd, int ref_id) {
if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
return 0;
if (fd->refs->ref_id[ref_id]->validated_md5)
return 0;
sam_hrecs_t *hrecs = fd->header->hrecs;
sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
hrecs->ref[ref_id].name);
if (!ty)
return 0;
sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
if (!m5tag)
return 0;
char *ref = fd->refs->ref_id[ref_id]->seq;
int64_t len = fd->refs->ref_id[ref_id]->length;
hts_md5_context *md5;
char unsigned buf[16];
char buf2[33];
if (!(md5 = hts_md5_init()))
return -1;
hts_md5_update(md5, ref, len);
hts_md5_final(buf, md5);
hts_md5_destroy(md5);
hts_md5_hex(buf2, buf);
if (strcmp(m5tag->str+3, buf2)) {
hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
hrecs->ref[ref_id].name);
hts_log_error("Please use the correct reference, or "
"consider using embed_ref=2");
return -1;
}
fd->refs->ref_id[ref_id]->validated_md5 = 1;
return 0;
}
int cram_encode_container(cram_fd *fd, cram_container *c) {
int i, j, slice_offset;
cram_block_compression_hdr *h = c->comp_hdr;
cram_block *c_hdr;
int multi_ref = 0;
int r1, r2, sn, nref, embed_ref, no_ref;
spare_bams *spares;
if (!c->bams)
goto err;
if (CRAM_MAJOR_VERS(fd->version) == 1)
goto err;
#define goto_err goto err
pthread_mutex_lock(&fd->ref_lock);
int failed_embed = (fd->no_ref_counter >= 5); if (!failed_embed && c->embed_ref == -2) {
hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
fd->no_ref = c->no_ref = 0;
fd->embed_ref = c->embed_ref = 2;
} else if (failed_embed && c->embed_ref == -2) {
hts_log_warning("Keeping non-ref mode from now on");
fd->embed_ref = c->embed_ref = 0;
}
pthread_mutex_unlock(&fd->ref_lock);
restart:
pthread_mutex_lock(&fd->ref_lock);
nref = fd->refs->nref;
pthread_mutex_unlock(&fd->ref_lock);
embed_ref = c->embed_ref;
no_ref = c->no_ref;
if (!no_ref) {
if (!c->bams || !c->curr_c_rec || !c->bams[0])
goto_err;
bam_seq_t *b = c->bams[0];
if (embed_ref <= 1) {
char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
if (!ref && bam_ref(b) >= 0) {
if (!c->pos_sorted) {
hts_log_warning("Failed to load reference #%d",
bam_ref(b));
hts_log_warning("Switching to non-ref mode");
pthread_mutex_lock(&fd->ref_lock);
c->embed_ref = fd->embed_ref = 0;
c->no_ref = fd->no_ref = 1;
pthread_mutex_unlock(&fd->ref_lock);
goto restart;
}
if (c->multi_seq || embed_ref == 0) {
hts_log_error("Failed to load reference #%d", bam_ref(b));
return -1;
}
hts_log_warning("Failed to load reference #%d", bam_ref(b));
hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
" reference");
if (embed_ref <= 0)
hts_log_warning("NOTE: the CRAM file will be bigger than"
" using an external reference");
pthread_mutex_lock(&fd->ref_lock);
embed_ref = c->embed_ref = fd->embed_ref = 2;
pthread_mutex_unlock(&fd->ref_lock);
goto auto_ref;
} else if (ref) {
if (validate_md5(fd, c->ref_seq_id) < 0)
goto_err;
}
if ((c->ref_id = bam_ref(b)) >= 0) {
c->ref_seq_id = c->ref_id;
c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
c->ref_start = 1;
c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
}
} else {
auto_ref:
if ((c->ref_id = bam_ref(b)) >= 0) {
c->ref = NULL;
c->ref_free = 1;
}
}
c->ref_seq_id = c->ref_id;
} else {
c->ref_id = bam_ref(c->bams[0]);
cram_ref_incr(fd->refs, c->ref_id);
c->ref_seq_id = c->ref_id;
}
if (!no_ref && c->refs_used) {
for (i = 0; i < nref; i++) {
if (c->refs_used[i]) {
if (cram_get_ref(fd, i, 1, 0)) {
if (validate_md5(fd, i) < 0)
goto_err;
} else {
hts_log_warning("Failed to find reference, "
"switching to non-ref mode");
no_ref = c->no_ref = 1;
}
}
}
}
for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
cram_slice *s = c->slices[sn];
int64_t first_base = INT64_MAX, last_base = INT64_MIN;
int r1_start = r1;
assert(sn < c->curr_slice);
if (lossy_read_names(fd, c, s, r1_start) != 0)
return -1;
kstring_t MD = {0};
if (embed_ref == 2) {
if (cram_generate_reference(c, s, r1) < 0) {
if (sn > 0) {
hts_log_error("Failed to build reference, "
"switching to non-ref mode");
return -1;
} else {
hts_log_warning("Failed to build reference, "
"switching to non-ref mode");
}
pthread_mutex_lock(&fd->ref_lock);
c->embed_ref = fd->embed_ref = -2; c->no_ref = fd->no_ref = 1;
fd->no_ref_counter++; pthread_mutex_unlock(&fd->ref_lock);
failed_embed = 1;
goto restart;
} else {
pthread_mutex_lock(&fd->ref_lock);
fd->no_ref_counter -= (fd->no_ref_counter > 0);
pthread_mutex_unlock(&fd->ref_lock);
}
}
for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
cram_record *cr = &s->crecs[r2];
bam_seq_t *b = c->bams[r1];
if (c->multi_seq && !no_ref) {
if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
if (c->ref_seq_id >= 0)
cram_ref_decr(fd->refs, c->ref_seq_id);
if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
hts_log_error("Failed to load reference #%d", bam_ref(b));
free(MD.s);
return -1;
}
if (validate_md5(fd, bam_ref(b)) < 0)
return -1;
c->ref_seq_id = bam_ref(b); if (!fd->refs->ref_id[c->ref_seq_id]->seq)
return -1;
c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
c->ref_start = 1;
c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
}
}
if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
no_ref) != 0) {
free(MD.s);
return -1;
}
if (first_base > cr->apos)
first_base = cr->apos;
if (last_base < cr->aend)
last_base = cr->aend;
}
free(MD.s);
if (add_read_names(fd, c, s, r1_start) < 0)
return -1;
if (c->multi_seq) {
s->hdr->ref_seq_id = -2;
s->hdr->ref_seq_start = 0;
s->hdr->ref_seq_span = 0;
} else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
s->hdr->ref_seq_id = -1;
s->hdr->ref_seq_start = 0;
s->hdr->ref_seq_span = 0;
} else {
s->hdr->ref_seq_id = c->ref_id;
s->hdr->ref_seq_start = first_base;
s->hdr->ref_seq_span = MAX(0, last_base - first_base + 1);
}
s->hdr->num_records = r2;
if (c->tags_used->n_occupied) {
int ntags = c->tags_used->n_occupied;
s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
if (!s->aux_block)
return -1;
khint_t k;
s->naux_block = 0;
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
if (!kh_exist(c->tags_used, k))
continue;
cram_tag_map *tm = kh_val(c->tags_used, k);
if (!tm) goto_err;
if (!tm->blk) continue;
s->aux_block[s->naux_block++] = tm->blk;
tm->blk = NULL;
if (!tm->blk2) continue;
s->aux_block[s->naux_block++] = tm->blk2;
tm->blk2 = NULL;
}
assert(s->naux_block <= 2*c->tags_used->n_occupied);
}
}
if (c->multi_seq && !no_ref) {
if (c->ref_seq_id >= 0)
cram_ref_decr(fd->refs, c->ref_seq_id);
}
spares = malloc(sizeof(*spares));
if (!spares) goto_err;
pthread_mutex_lock(&fd->bam_list_lock);
spares->bams = c->bams;
spares->next = fd->bl;
fd->bl = spares;
pthread_mutex_unlock(&fd->bam_list_lock);
c->bams = NULL;
cram_stats_encoding(fd, c->stats[DS_RI]);
multi_ref = c->stats[DS_RI]->nvals > 1;
pthread_mutex_lock(&fd->metrics_lock);
fd->last_RI_count = c->stats[DS_RI]->nvals;
pthread_mutex_unlock(&fd->metrics_lock);
if (multi_ref) {
hts_log_info("Multi-ref container");
c->ref_seq_id = -2;
c->ref_seq_start = 0;
c->ref_seq_span = 0;
}
no_ref = c->no_ref;
int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
for (i = 0; i < c->curr_slice; i++) {
cram_slice *s = c->slices[i];
if (CRAM_MAJOR_VERS(fd->version) != 1) {
if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
hts_md5_context *md5 = hts_md5_init();
if (!md5)
return -1;
hts_md5_update(md5,
c->ref + s->hdr->ref_seq_start - c->ref_start,
s->hdr->ref_seq_span);
hts_md5_final(s->hdr->md5, md5);
hts_md5_destroy(md5);
} else {
memset(s->hdr->md5, 0, 16);
}
}
}
c->num_records = 0;
c->num_blocks = 1; c->length = 0;
h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
c->stats[DS_BF], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
c->stats[DS_CF], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
if (c->pos_sorted)
h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
c->stats[DS_AP],
is_v4 ? E_LONG : E_INT,
NULL, fd->version, &fd->vv);
else
h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
: E_EXTERNAL,
NULL,
is_v4 ? E_LONG : E_INT,
NULL, fd->version, &fd->vv);
} else {
hts_pos_t p[2] = {0, c->max_apos};
h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
is_v4 ? E_LONG : E_INT,
p, fd->version, &fd->vv);
}
if (!h->codecs[DS_AP]) goto_err;
h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
c->stats[DS_RG],
E_INT,
NULL,
fd->version, &fd->vv);
if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
c->stats[DS_MQ], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
c->stats[DS_NS], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
c->stats[DS_MF], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
c->stats[DS_TS],
is_v4 ? E_LONG : E_INT,
NULL, fd->version, &fd->vv);
if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
c->stats[DS_NP],
is_v4 ? E_LONG : E_INT,
NULL, fd->version, &fd->vv);
if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
c->stats[DS_NF], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
c->stats[DS_RL], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
c->stats[DS_FN], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
c->stats[DS_FC], E_BYTE, NULL,
fd->version, &fd->vv);
if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
c->stats[DS_FP], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
c->stats[DS_DL], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
c->stats[DS_BA], E_BYTE, NULL,
fd->version, &fd->vv);
if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
if (CRAM_MAJOR_VERS(fd->version) >= 3) {
cram_byte_array_len_encoder e;
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
? E_VARINT_UNSIGNED
: E_EXTERNAL;
e.len_dat = (void *)DS_BB_len;
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)DS_BB;
h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
if (!h->codecs[DS_BB]) goto_err;
} else {
h->codecs[DS_BB] = NULL;
}
h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
c->stats[DS_BS], E_BYTE, NULL,
fd->version, &fd->vv);
if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
h->codecs[DS_TL] = NULL;
h->codecs[DS_RI] = NULL;
h->codecs[DS_RS] = NULL;
h->codecs[DS_PD] = NULL;
h->codecs[DS_HC] = NULL;
h->codecs[DS_SC] = NULL;
h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
c->stats[DS_TC], E_BYTE, NULL,
fd->version, &fd->vv);
if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
c->stats[DS_TN], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
} else {
h->codecs[DS_TC] = NULL;
h->codecs[DS_TN] = NULL;
h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
c->stats[DS_TL], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
c->stats[DS_RI], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
c->stats[DS_RS], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
c->stats[DS_PD], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
c->stats[DS_HC], E_INT, NULL,
fd->version, &fd->vv);
if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
if (1) {
int i2[2] = {0, DS_SC};
h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
E_BYTE_ARRAY, (void *)i2,
fd->version, &fd->vv);
} else {
cram_byte_array_len_encoder e;
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
? E_VARINT_UNSIGNED
: E_EXTERNAL;
e.len_dat = (void *)DS_SC_len;
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)DS_SC;
h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
}
if (!h->codecs[DS_SC]) goto_err;
}
{
int i2[2] = {0, DS_IN};
h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
E_BYTE_ARRAY, (void *)i2,
fd->version, &fd->vv);
if (!h->codecs[DS_IN]) goto_err;
}
h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
(void *)DS_QS,
fd->version, &fd->vv);
if (!h->codecs[DS_QS]) goto_err;
{
int i2[2] = {0, DS_RN};
h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
E_BYTE_ARRAY, (void *)i2,
fd->version, &fd->vv);
if (!h->codecs[DS_RN]) goto_err;
}
for (i = 0; i < c->curr_slice; i++) {
hts_log_info("Encode slice %d", i);
int local_embed_ref =
embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
return -1;
}
{
h->ref_seq_id = c->ref_seq_id;
h->ref_seq_start = c->ref_seq_start;
h->ref_seq_span = c->ref_seq_span;
h->num_records = c->num_records;
h->qs_seq_orient = c->qs_seq_orient;
h->AP_delta = c->pos_sorted;
memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
return -1;
}
c->num_landmarks = c->curr_slice;
c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
if (!c->landmark)
return -1;
{
slice_offset = c_hdr->method == RAW
? c_hdr->uncomp_size
: c_hdr->comp_size;
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
fd->vv.varint_size(c_hdr->content_id) +
fd->vv.varint_size(c_hdr->comp_size) +
fd->vv.varint_size(c_hdr->uncomp_size);
}
c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
c->ref_seq_start = 0;
c->ref_seq_span = 0;
} else {
c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
}
for (i = 0; i < c->curr_slice; i++) {
cram_slice *s = c->slices[i];
c->num_blocks += s->hdr->num_blocks + 1; c->landmark[i] = slice_offset;
if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
c->ref_seq_start + c->ref_seq_span) {
c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
- c->ref_seq_start;
}
slice_offset += s->hdr_block->method == RAW
? s->hdr_block->uncomp_size
: s->hdr_block->comp_size;
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
fd->vv.varint_size(s->hdr_block->content_id) +
fd->vv.varint_size(s->hdr_block->comp_size) +
fd->vv.varint_size(s->hdr_block->uncomp_size);
for (j = 0; j < s->hdr->num_blocks; j++) {
slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
fd->vv.varint_size(s->block[j]->content_id) +
fd->vv.varint_size(s->block[j]->comp_size) +
fd->vv.varint_size(s->block[j]->uncomp_size);
slice_offset += s->block[j]->method == RAW
? s->block[j]->uncomp_size
: s->block[j]->comp_size;
}
}
c->length += slice_offset;
c->comp_hdr_block = c_hdr;
if (c->ref_seq_id >= 0) {
if (c->ref_free) {
free(c->ref);
c->ref = NULL;
} else {
cram_ref_decr(fd->refs, c->ref_seq_id);
}
}
if (!no_ref && c->refs_used) {
for (i = 0; i < fd->refs->nref; i++) {
if (c->refs_used[i])
cram_ref_decr(fd->refs, i);
}
}
return 0;
err:
return -1;
}
static int cram_add_feature(cram_container *c, cram_slice *s,
cram_record *r, cram_feature *f) {
if (s->nfeatures >= s->afeatures) {
s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
if (!s->features)
return -1;
}
if (!r->nfeature++) {
r->feature = s->nfeatures;
if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
return -1;
} else {
if (cram_stats_add(c->stats[DS_FP],
f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
return -1;
}
if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
return -1;
s->features[s->nfeatures++] = *f;
return 0;
}
static int cram_add_substitution(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *r,
int pos, char base, char qual, char ref) {
cram_feature f;
if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
f.X.pos = pos+1;
f.X.code = 'X';
f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
return -1;
} else {
f.B.pos = pos+1;
f.B.code = 'B';
f.B.base = base;
f.B.qual = qual;
if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
BLOCK_APPEND_CHAR(s->qual_blk, qual);
}
return cram_add_feature(c, s, r, &f);
block_err:
return -1;
}
static int cram_add_bases(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.b.pos = pos+1;
f.b.code = 'b';
f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
f.b.len = len;
return cram_add_feature(c, s, r, &f);
}
static int cram_add_base(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *r,
int pos, char base, char qual) {
cram_feature f;
f.B.pos = pos+1;
f.B.code = 'B';
f.B.base = base;
f.B.qual = qual;
if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
BLOCK_APPEND_CHAR(s->qual_blk, qual);
return cram_add_feature(c, s, r, &f);
block_err:
return -1;
}
static int cram_add_quality(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *r,
int pos, char qual) {
cram_feature f;
f.Q.pos = pos+1;
f.Q.code = 'Q';
f.Q.qual = qual;
if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
BLOCK_APPEND_CHAR(s->qual_blk, qual);
return cram_add_feature(c, s, r, &f);
block_err:
return -1;
}
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.D.pos = pos+1;
f.D.code = 'D';
f.D.len = len;
if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
return cram_add_feature(c, s, r, &f);
}
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base, int version) {
cram_feature f;
f.S.pos = pos+1;
f.S.code = 'S';
f.S.len = len;
switch (CRAM_MAJOR_VERS(version)) {
case 1:
f.S.seq_idx = BLOCK_SIZE(s->base_blk);
BLOCK_APPEND(s->base_blk, base, len);
BLOCK_APPEND_CHAR(s->base_blk, '\0');
break;
case 2:
default:
f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
if (base) {
BLOCK_APPEND(s->soft_blk, base, len);
} else {
int i;
for (i = 0; i < len; i++)
BLOCK_APPEND_CHAR(s->soft_blk, 'N');
}
BLOCK_APPEND_CHAR(s->soft_blk, '\0');
break;
}
return cram_add_feature(c, s, r, &f);
block_err:
return -1;
}
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.S.pos = pos+1;
f.S.code = 'H';
f.S.len = len;
if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
return cram_add_feature(c, s, r, &f);
}
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.S.pos = pos+1;
f.S.code = 'N';
f.S.len = len;
if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
return cram_add_feature(c, s, r, &f);
}
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.S.pos = pos+1;
f.S.code = 'P';
f.S.len = len;
if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
return cram_add_feature(c, s, r, &f);
}
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
int pos, int len, char *base) {
cram_feature f;
f.I.pos = pos+1;
if (len == 1) {
char b = base ? *base : 'N';
f.i.code = 'i';
f.i.base = b;
if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
} else {
f.I.code = 'I';
f.I.len = len;
f.S.seq_idx = BLOCK_SIZE(s->base_blk);
if (base) {
BLOCK_APPEND(s->base_blk, base, len);
} else {
int i;
for (i = 0; i < len; i++)
BLOCK_APPEND_CHAR(s->base_blk, 'N');
}
BLOCK_APPEND_CHAR(s->base_blk, '\0');
}
return cram_add_feature(c, s, r, &f);
block_err:
return -1;
}
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
cram_container *c,
cram_slice *s, cram_record *cr,
int verbatim_NM, int verbatim_MD,
int NM, kstring_t *MD, int cf_tag,
int no_ref, int *err) {
char *aux, *orig;
sam_hrec_rg_t *brg = NULL;
int aux_size = bam_get_l_aux(b);
const char *aux_end = bam_data_end(b);
cram_block *td_b = c->comp_hdr->TD_blk;
int TD_blk_size = BLOCK_SIZE(td_b), new;
char *key;
khint_t k;
if (err) *err = 1;
orig = aux = (char *)bam_aux(b);
if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
aux = malloc(aux_size+4);
if (!aux)
return NULL;
memcpy(aux, orig, aux_size);
aux[aux_size++] = 'c';
aux[aux_size++] = 'F';
aux[aux_size++] = 'C';
aux[aux_size++] = cf_tag;
orig = aux;
aux_end = aux + aux_size;
}
while (aux_end - aux >= 1 && aux[0] != 0) {
int r;
if (aux - orig >= aux_size - 3)
goto err;
if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
char *rg = &aux[3];
brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
if (brg) {
while (aux < aux_end && *aux++);
if (CRAM_MAJOR_VERS(fd->version) >= 4)
BLOCK_APPEND(td_b, "RG*", 3);
continue;
} else {
hts_log_warning("Missing @RG header for RG \"%s\"", rg);
}
}
if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
while (aux < aux_end && *aux++);
if (CRAM_MAJOR_VERS(fd->version) >= 4)
BLOCK_APPEND(td_b, "MD*", 3);
continue;
}
}
}
if (aux[0] == 'N' && aux[1] == 'M') {
if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
if (NM_ == NM) {
switch(aux[2]) {
case 'A': case 'C': case 'c': aux+=4; break;
case 'S': case 's': aux+=5; break;
case 'I': case 'i': case 'f': aux+=7; break;
default:
hts_log_error("Unhandled type code for NM tag");
goto err;
}
if (CRAM_MAJOR_VERS(fd->version) >= 4)
BLOCK_APPEND(td_b, "NM*", 3);
continue;
}
}
}
BLOCK_APPEND(td_b, aux, 3);
int key = (((unsigned char *) aux)[0]<<16 |
((unsigned char *) aux)[1]<<8 |
((unsigned char *) aux)[2]);
k = kh_put(m_tagmap, c->tags_used, key, &r);
if (-1 == r)
goto err;
else if (r != 0)
kh_val(c->tags_used, k) = NULL;
if (r == 1) {
khint_t k_global;
pthread_mutex_lock(&fd->metrics_lock);
k_global = kh_put(m_metrics, fd->tags_used, key, &r);
if (-1 == r) {
pthread_mutex_unlock(&fd->metrics_lock);
goto err;
}
if (r >= 1) {
kh_val(fd->tags_used, k_global) = cram_new_metrics();
if (!kh_val(fd->tags_used, k_global)) {
kh_del(m_metrics, fd->tags_used, k_global);
pthread_mutex_unlock(&fd->metrics_lock);
goto err;
}
}
pthread_mutex_unlock(&fd->metrics_lock);
int i2[2] = {'\t',key};
size_t sk = key;
cram_tag_map *m = calloc(1, sizeof(*m));
if (!m)
goto_err;
kh_val(c->tags_used, k) = m;
cram_codec *c;
switch(aux[2]) {
case 'Z': case 'H':
c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
E_BYTE_ARRAY, (void *)i2,
fd->version, &fd->vv);
break;
case 'A': case 'c': case 'C': {
cram_byte_array_len_encoder e;
cram_stats st;
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
e.len_encoding = E_HUFFMAN;
e.len_dat = NULL; } else {
e.len_encoding = E_CONST_INT;
e.len_dat = NULL; }
memset(&st, 0, sizeof(st));
if (cram_stats_add(&st, 1) < 0) goto block_err;
cram_stats_encoding(fd, &st);
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)sk;
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
break;
}
case 's': case 'S': {
cram_byte_array_len_encoder e;
cram_stats st;
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
e.len_encoding = E_HUFFMAN;
e.len_dat = NULL; } else {
e.len_encoding = E_CONST_INT;
e.len_dat = NULL; }
memset(&st, 0, sizeof(st));
if (cram_stats_add(&st, 2) < 0) goto block_err;
cram_stats_encoding(fd, &st);
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)sk;
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
break;
}
case 'i': case 'I': case 'f': {
cram_byte_array_len_encoder e;
cram_stats st;
if (CRAM_MAJOR_VERS(fd->version) <= 3) {
e.len_encoding = E_HUFFMAN;
e.len_dat = NULL; } else {
e.len_encoding = E_CONST_INT;
e.len_dat = NULL; }
memset(&st, 0, sizeof(st));
if (cram_stats_add(&st, 4) < 0) goto block_err;
cram_stats_encoding(fd, &st);
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)sk;
c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
break;
}
case 'B': {
cram_byte_array_len_encoder e;
e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
? E_VARINT_UNSIGNED
: E_EXTERNAL;
e.len_dat = (void *)sk;
e.val_encoding = E_EXTERNAL;
e.val_dat = (void *)sk;
c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
E_BYTE_ARRAY, (void *)&e,
fd->version, &fd->vv);
break;
}
default:
hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
c = NULL;
}
if (!c)
goto_err;
m->codec = c;
pthread_mutex_lock(&fd->metrics_lock);
m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
pthread_mutex_unlock(&fd->metrics_lock);
}
cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
if (!tm) goto_err;
cram_codec *codec = tm->codec;
if (!tm->codec) goto_err;
switch(aux[2]) {
case 'A': case 'C': case 'c':
if (aux_end - aux < 3+1)
goto err;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
codec->u.e_byte_array_len.val_codec->out = tm->blk;
}
aux+=3;
BLOCK_APPEND_CHAR(tm->blk, *aux);
aux++;
break;
case 'S': case 's':
if (aux_end - aux < 3+2)
goto err;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
codec->u.e_byte_array_len.val_codec->out = tm->blk;
}
aux+=3;
BLOCK_APPEND(tm->blk, aux, 2);
aux+=2;
break;
case 'I': case 'i': case 'f':
if (aux_end - aux < 3+4)
goto err;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
codec->u.e_byte_array_len.val_codec->out = tm->blk;
}
aux+=3;
BLOCK_APPEND(tm->blk, aux, 4);
aux+=4;
break;
case 'd':
if (aux_end - aux < 3+8)
goto err;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
codec->u.e_byte_array_len.val_codec->out = tm->blk;
}
aux+=3; BLOCK_APPEND(tm->blk, aux, 8);
aux+=8;
break;
case 'Z': case 'H': {
if (aux_end - aux < 3)
goto err;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
codec->out = tm->blk;
}
char *aux_s;
aux += 3;
aux_s = aux;
while (aux < aux_end && *aux++);
if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
goto err;
break;
}
case 'B': {
if (aux_end - aux < 4+4)
goto err;
int type = aux[3];
uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
((uint64_t)((unsigned char *)aux)[5]) << 8 |
((uint64_t)((unsigned char *)aux)[6]) <<16 |
((uint64_t)((unsigned char *)aux)[7]) <<24);
uint64_t blen;
if (!tm->blk) {
if (!(tm->blk = cram_new_block(EXTERNAL, key)))
goto err;
if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
goto err;
codec->u.e_byte_array_len.len_codec->out = tm->blk2;
codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
} else {
codec->u.e_byte_array_len.len_codec->out = tm->blk;
codec->u.e_byte_array_len.val_codec->out = tm->blk;
}
}
aux+=3;
switch (type) {
case 'c': case 'C':
blen = count;
break;
case 's': case 'S':
blen = 2*count;
break;
case 'i': case 'I': case 'f':
blen = 4*count;
break;
default:
hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
goto err;
}
blen += 5; if (aux_end - aux < blen || blen > INT_MAX)
goto err;
if (codec->encode(s, codec, aux, (int) blen) < 0)
goto err;
aux += blen;
break;
}
default:
hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
goto err;
}
tm->blk->m = tm->m;
}
BLOCK_APPEND_CHAR(td_b, 0);
key = string_ndup(c->comp_hdr->TD_keys,
(char *)BLOCK_DATA(td_b) + TD_blk_size,
BLOCK_SIZE(td_b) - TD_blk_size);
if (!key)
goto block_err;
k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
if (new < 0) {
goto err;
} else if (new == 0) {
BLOCK_SIZE(td_b) = TD_blk_size;
} else {
kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
c->comp_hdr->nTL++;
}
cr->TL = kh_val(c->comp_hdr->TD_hash, k);
if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
goto block_err;
if (orig != (char *)bam_aux(b))
free(orig);
if (err) *err = 0;
return brg;
err:
block_err:
if (orig != (char *)bam_aux(b))
free(orig);
return NULL;
}
void cram_update_curr_slice(cram_container *c, int version) {
cram_slice *s = c->slice;
if (c->multi_seq) {
s->hdr->ref_seq_id = -2;
s->hdr->ref_seq_start = 0;
s->hdr->ref_seq_span = 0;
} else if (c->curr_ref == -1 && CRAM_ge31(version)) {
s->hdr->ref_seq_id = -1;
s->hdr->ref_seq_start = 0;
s->hdr->ref_seq_span = 0;
} else {
s->hdr->ref_seq_id = c->curr_ref;
s->hdr->ref_seq_start = c->first_base;
s->hdr->ref_seq_span = MAX(0, c->last_base - c->first_base + 1);
}
s->hdr->num_records = c->curr_rec;
if (c->curr_slice == 0) {
if (c->ref_seq_id != s->hdr->ref_seq_id)
c->ref_seq_id = s->hdr->ref_seq_id;
c->ref_seq_start = c->first_base;
}
c->curr_slice++;
}
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
cram_container *c = fd->ctr;
int i;
if (c->curr_ref == -2)
c->curr_ref = bam_ref(b);
if (c->slice)
cram_update_curr_slice(c, fd->version);
if (c->curr_slice == c->max_slice ||
(bam_ref(b) != c->curr_ref && !c->multi_seq)) {
c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
c->ref_seq_id, c->ref_seq_start,
c->ref_seq_start + c->ref_seq_span -1);
if (-1 == cram_flush_container_mt(fd, c))
return NULL;
if (!fd->pool) {
for (i = 0; i < c->max_slice; i++) {
cram_free_slice(c->slices[i]);
c->slices[i] = NULL;
}
c->slice = NULL;
c->curr_slice = 0;
cram_free_container(c);
}
c = fd->ctr = cram_new_container(fd->seqs_per_slice,
fd->slices_per_container);
if (!c)
return NULL;
pthread_mutex_lock(&fd->ref_lock);
c->no_ref = fd->no_ref;
c->embed_ref = fd->embed_ref;
c->record_counter = fd->record_counter;
pthread_mutex_unlock(&fd->ref_lock);
c->curr_ref = bam_ref(b);
}
c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
c->slice = c->slices[c->curr_slice] =
cram_new_slice(MAPPED_SLICE, c->max_rec);
if (!c->slice)
return NULL;
if (c->multi_seq) {
c->slice->hdr->ref_seq_id = -2;
c->slice->hdr->ref_seq_start = 0;
c->slice->last_apos = 1;
} else {
c->slice->hdr->ref_seq_id = bam_ref(b);
c->slice->hdr->ref_seq_start = bam_pos(b)+1;
c->slice->last_apos = bam_pos(b)+1;
}
c->curr_rec = 0;
c->s_num_bases = 0;
c->n_mapped = 0;
c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
return c;
}
static int process_one_read(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *cr,
bam_seq_t *b, int rnum, kstring_t *MD,
int embed_ref, int no_ref) {
int i, fake_qual = -1, NM = 0;
char *cp;
char *ref, *seq, *qual;
int verbatim_NM = fd->store_nm;
int verbatim_MD = fd->store_md;
cr->flags = bam_flag(b);
cr->len = bam_seq_len(b);
uint8_t *md;
if (!(md = bam_aux_get(b, "MD")))
MD = NULL;
else
MD->l = 0;
int cf_tag = 0;
if (embed_ref == 2) {
cf_tag = MD ? 0 : 1; cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; }
ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
cr->ref_id = bam_ref(b);
if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
goto block_err;
if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
goto block_err;
if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
cr->cram_flags |= CRAM_FLAG_NO_SEQ;
c->num_bases += cr->len;
cr->apos = bam_pos(b)+1;
if (c->pos_sorted) {
if (cr->apos < s->last_apos && !fd->ap_delta) {
c->pos_sorted = 0;
} else {
if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
goto block_err;
s->last_apos = cr->apos;
}
} else {
}
c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
cr->seq = BLOCK_SIZE(s->seqs_blk);
cr->qual = BLOCK_SIZE(s->qual_blk);
BLOCK_GROW(s->seqs_blk, cr->len+1);
BLOCK_GROW(s->qual_blk, cr->len);
seq = cp = (char *)BLOCK_END(s->seqs_blk);
*seq = 0;
nibble2base(bam_seq(b), cp, cr->len);
BLOCK_SIZE(s->seqs_blk) += cr->len;
qual = cp = (char *)bam_qual(b);
if (!(cr->flags & BAM_FUNMAP)) {
uint32_t *cig_to, *cig_from;
int64_t apos = cr->apos-1, spos = 0;
int64_t MD_last = apos;
cr->cigar = s->ncigar;
cr->ncigar = bam_cigar_len(b);
while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
if (!s->cigar)
return -1;
}
cig_to = (uint32_t *)s->cigar;
cig_from = (uint32_t *)bam_cigar(b);
cr->feature = 0;
cr->nfeature = 0;
for (i = 0; i < cr->ncigar; i++) {
enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
cig_to[i] = cig_from[i];
switch (cig_op) {
int l;
case BAM_CMATCH:
case BAM_CBASE_MATCH:
case BAM_CBASE_MISMATCH:
l = 0;
if (!no_ref && cr->len) {
int end = cig_len+apos < c->ref_end
? cig_len : c->ref_end - apos;
char *sp = &seq[spos];
char *rp = &ref[apos];
char *qp = &qual[spos];
if (end > cr->len) {
hts_log_error("CIGAR and query sequence are of different length");
return -1;
}
for (l = 0; l < end; l++) {
if (rp[l] == 'N' && sp[l] == 'N')
verbatim_NM = verbatim_MD = 1;
if (rp[l] != sp[l]) {
if (MD && ref) {
if (kputuw(apos+l - MD_last, MD) < 0) goto err;
if (kputc(rp[l], MD) < 0) goto err;
MD_last = apos+l+1;
}
NM++;
if (!sp[l])
break;
if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
#if 0#else
int nl = l;
int max_end = nl, max_score = 0, score = 0;
while (nl < end) {
if (rp[nl] != sp[nl]) {
score += 3;
if (max_score < score) {
max_score = score;
max_end = nl;
}
} else {
score--;
if (score < -2 ||
max_score - score > 7)
break;
}
nl++;
}
if (max_score > 20) {
cram_add_bases(fd, c, s, cr, spos+l,
max_end-l, &seq[spos+l]);
l = max_end-1;
} else {
while (l < nl) {
if (rp[l] != sp[l])
cram_add_substitution(fd, c, s,
cr, spos+l,
sp[l], qp[l],
rp[l]);
l++;
}
l--;
}
#endif
} else {
if (cram_add_substitution(fd, c, s, cr, spos+l,
sp[l], qp[l], rp[l]))
return -1;
}
}
}
spos += l;
apos += l;
}
if (l < cig_len && cr->len) {
if (no_ref) {
if (CRAM_MAJOR_VERS(fd->version) == 3) {
if (cram_add_bases(fd, c, s, cr, spos,
cig_len-l, &seq[spos]))
return -1;
spos += cig_len-l;
} else {
for (; l < cig_len && seq[spos]; l++, spos++) {
if (cram_add_base(fd, c, s, cr, spos,
seq[spos], qual[spos]))
return -1;
}
}
} else {
verbatim_NM = verbatim_MD = 1;
for (; l < cig_len && seq[spos]; l++, spos++) {
if (cram_add_base(fd, c, s, cr, spos,
seq[spos], qual[spos]))
return -1;
}
}
apos += cig_len;
} else if (!cr->len) {
verbatim_NM = verbatim_MD = 1;
apos += cig_len;
spos += cig_len;
}
break;
case BAM_CDEL:
if (MD && ref) {
if (kputuw(apos - MD_last, MD) < 0) goto err;
if (apos < c->ref_end) {
if (kputc_('^', MD) < 0) goto err;
if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
goto err;
}
}
NM += cig_len;
if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
return -1;
apos += cig_len;
MD_last = apos;
break;
case BAM_CREF_SKIP:
if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
return -1;
apos += cig_len;
MD_last += cig_len;
break;
case BAM_CINS:
if (cram_add_insertion(c, s, cr, spos, cig_len,
cr->len ? &seq[spos] : NULL))
return -1;
if (no_ref && cr->len) {
for (l = 0; l < cig_len; l++, spos++) {
cram_add_quality(fd, c, s, cr, spos, qual[spos]);
}
} else {
spos += cig_len;
}
NM += cig_len;
break;
case BAM_CSOFT_CLIP:
if (cram_add_softclip(c, s, cr, spos, cig_len,
cr->len ? &seq[spos] : NULL,
fd->version))
return -1;
if (no_ref &&
!(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
if (cr->len) {
for (l = 0; l < cig_len; l++, spos++) {
cram_add_quality(fd, c, s, cr, spos, qual[spos]);
}
} else {
for (l = 0; l < cig_len; l++, spos++) {
cram_add_quality(fd, c, s, cr, spos, -1);
}
}
} else {
spos += cig_len;
}
break;
case BAM_CHARD_CLIP:
if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
return -1;
break;
case BAM_CPAD:
if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
return -1;
break;
default:
hts_log_error("Unknown CIGAR op code %d", cig_op);
return -1;
}
}
if (cr->len && spos != cr->len) {
hts_log_error("CIGAR and query sequence are of different length");
return -1;
}
fake_qual = spos;
cr->aend = no_ref ? apos : MIN(apos, c->ref_end);
if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
goto block_err;
if (MD && ref)
if (kputuw(apos - MD_last, MD) < 0) goto err;
} else {
cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
cr->cigar = 0;
cr->ncigar = 0;
cr->nfeature = 0;
cr->aend = MIN(cr->apos, c->ref_end);
for (i = 0; i < cr->len; i++)
if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
goto block_err;
fake_qual = 0;
}
cr->ntags = 0; int err = 0;
sam_hrec_rg_t *brg =
cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
cf_tag, no_ref, &err);
if (err)
goto block_err;
if (brg) {
cr->rg = brg->id;
} else if (CRAM_MAJOR_VERS(fd->version) == 1) {
sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
if (!brg) goto block_err;
cr->rg = brg->id;
} else {
cr->rg = -1;
}
if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
goto block_err;
if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
if (cr->len == 0) {
cr->len = fake_qual;
BLOCK_GROW(s->qual_blk, cr->len);
cp = (char *)BLOCK_END(s->qual_blk);
memset(cp, 255, cr->len);
} else {
BLOCK_GROW(s->qual_blk, cr->len);
cp = (char *)BLOCK_END(s->qual_blk);
char *from = (char *)&bam_qual(b)[0];
char *to = &cp[0];
memcpy(to, from, cr->len);
if (!c->qs_seq_orient) {
if (cr->flags & BAM_FREVERSE) {
int i, j;
for (i = 0, j = cr->len-1; i < j; i++, j--) {
unsigned char c;
c = to[i];
to[i] = to[j];
to[j] = c;
}
}
}
}
BLOCK_SIZE(s->qual_blk) += cr->len;
} else {
if (cr->len == 0)
cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
}
if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
goto block_err;
{
int new;
khint_t k;
int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
if (cr->flags & BAM_FPAIRED) {
char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
if (!key)
return -1;
k = kh_put(m_s2i, s->pair[sec], key, &new);
if (-1 == new)
return -1;
else if (new > 0)
kh_val(s->pair[sec], k) = rnum;
} else {
new = 1;
k = 0; }
if (new == 0) {
cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
int64_t aleft, aright;
int sign;
aleft = MIN(cr->apos, p->apos);
aright = MAX(cr->aend, p->aend);
if (cr->apos < p->apos) {
sign = 1;
} else if (cr->apos > p->apos) {
sign = -1;
} else if (cr->flags & BAM_FREAD1) {
sign = 1;
} else {
sign = -1;
}
if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
!(fd->tlen_zero && bam_mate_pos(b) == 0))
goto detached;
if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
((p->flags & BAM_FUNMAP) != 0))
goto detached;
if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
((p->flags & BAM_FREVERSE) != 0))
goto detached;
if (p->ref_id != cr->ref_id &&
!(fd->tlen_zero && p->ref_id == -1))
goto detached;
if (p->mate_pos != cr->apos &&
!(fd->tlen_zero && p->mate_pos == 0))
goto detached;
if (((p->flags & BAM_FMUNMAP) != 0) !=
((p->mate_flags & CRAM_M_UNMAP) != 0))
goto detached;
if (((p->flags & BAM_FMREVERSE) != 0) !=
((p->mate_flags & CRAM_M_REVERSE) != 0))
goto detached;
if ((cr->flags & BAM_FSUPPLEMENTARY) ||
(p->flags & BAM_FSUPPLEMENTARY))
goto detached;
if (fd->lossy_read_names &&
(!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
!((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
goto detached;
int explicit_tlen = 0;
int tflag1 = ((bam_ins_size(b) &&
llabs(bam_ins_size(b) - sign*(aright-aleft+1))
> fd->tlen_approx)
|| (!bam_ins_size(b) && !fd->tlen_zero));
int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
> fd->tlen_approx)
|| (!p->tlen && !fd->tlen_zero));
if (tflag1 || tflag2) {
if (CRAM_MAJOR_VERS(fd->version) >= 4) {
explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
} else {
goto detached;
}
}
cr->mate_pos = p->apos;
cram_stats_add(c->stats[DS_NP], cr->mate_pos);
cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
cram_stats_add(c->stats[DS_TS], cr->tlen);
cr->mate_flags =
((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
cram_stats_del(c->stats[DS_NP], p->mate_pos);
cram_stats_del(c->stats[DS_MF], p->mate_flags);
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
cram_stats_del(c->stats[DS_TS], p->tlen);
cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
}
cr->cram_flags &= ~CRAM_FLAG_DETACHED;
cr->cram_flags |= explicit_tlen;
if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
goto block_err;
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
}
p->cram_flags &= ~CRAM_FLAG_DETACHED;
p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
goto block_err;
p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
goto block_err;
kh_val(s->pair[sec], k) = rnum;
} else {
detached:
cr->mate_flags = 0;
if (bam_flag(b) & BAM_FMUNMAP)
cr->mate_flags |= CRAM_M_UNMAP;
if (bam_flag(b) & BAM_FMREVERSE)
cr->mate_flags |= CRAM_M_REVERSE;
if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
goto block_err;
cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
goto block_err;
cr->tlen = bam_ins_size(b);
if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
goto block_err;
cr->cram_flags |= CRAM_FLAG_DETACHED;
if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
goto block_err;
if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
goto block_err;
cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
}
}
cr->mqual = bam_map_qual(b);
if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
goto block_err;
cr->mate_ref_id = bam_mate_ref(b);
if (!(bam_flag(b) & BAM_FUNMAP)) {
if (c->first_base > cr->apos)
c->first_base = cr->apos;
if (c->last_base < cr->aend)
c->last_base = cr->aend;
}
return 0;
block_err:
err:
return -1;
}
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
cram_container *c;
if (!fd->ctr) {
fd->ctr = cram_new_container(fd->seqs_per_slice,
fd->slices_per_container);
if (!fd->ctr)
return -1;
fd->ctr->record_counter = fd->record_counter;
pthread_mutex_lock(&fd->ref_lock);
fd->ctr->no_ref = fd->no_ref;
fd->ctr->embed_ref = fd->embed_ref;
pthread_mutex_unlock(&fd->ref_lock);
}
c = fd->ctr;
int embed_ref = c->embed_ref;
if (!c->slice || c->curr_rec == c->max_rec ||
(bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
(c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
embed_ref<=0) {
if (!c->multi_seq)
hts_log_info("Multi-ref enabled for next container");
multi_seq = 1;
} else if (fd->multi_seq == 1) {
pthread_mutex_lock(&fd->metrics_lock);
if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
multi_seq = 0;
hts_log_info("Multi-ref disabled for next container");
}
pthread_mutex_unlock(&fd->metrics_lock);
}
slice_rec = c->slice_rec;
curr_rec = c->curr_rec;
if (CRAM_MAJOR_VERS(fd->version) == 1 ||
c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
if (NULL == (c = cram_next_container(fd, b))) {
if (fd->ctr) {
fd->ctr_mt = fd->ctr; fd->ctr = NULL;
}
return -1;
}
}
if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
fd->multi_seq = -1;
} else if (multi_seq) {
fd->multi_seq = 1;
c->multi_seq = 1;
c->pos_sorted = 0;
pthread_mutex_lock(&fd->ref_lock);
if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
hts_log_warning("Changing from embed_ref to no_ref mode");
c->embed_ref = fd->embed_ref = 0; c->no_ref = fd->no_ref = 1;
}
pthread_mutex_unlock(&fd->ref_lock);
if (!c->refs_used) {
pthread_mutex_lock(&fd->ref_lock);
c->refs_used = calloc(fd->refs->nref, sizeof(int));
pthread_mutex_unlock(&fd->ref_lock);
if (!c->refs_used)
return -1;
}
}
fd->last_slice = curr_rec - slice_rec;
c->slice_rec = c->curr_rec;
if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
embed_ref<=0 && !fd->unsorted && multi_seq) {
if (!c->refs_used) {
pthread_mutex_lock(&fd->ref_lock);
c->refs_used = calloc(fd->refs->nref, sizeof(int));
pthread_mutex_unlock(&fd->ref_lock);
if (!c->refs_used)
return -1;
} else if (c->refs_used && c->refs_used[bam_ref(b)]) {
pthread_mutex_lock(&fd->ref_lock);
fd->unsorted = 1;
fd->multi_seq = 1;
pthread_mutex_unlock(&fd->ref_lock);
}
}
c->curr_ref = bam_ref(b);
if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
}
if (!c->bams) {
pthread_mutex_lock(&fd->bam_list_lock);
if (fd->bl) {
spare_bams *spare = fd->bl;
c->bams = spare->bams;
fd->bl = spare->next;
free(spare);
} else {
c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
if (!c->bams) {
pthread_mutex_unlock(&fd->bam_list_lock);
return -1;
}
}
pthread_mutex_unlock(&fd->bam_list_lock);
}
if (c->bams[c->curr_c_rec]) {
if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
return -1;
} else {
c->bams[c->curr_c_rec] = bam_dup1(b);
if (c->bams[c->curr_c_rec] == NULL)
return -1;
}
if (bam_seq_len(b)) {
c->s_num_bases += bam_seq_len(b);
} else {
hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
if (qlen > 100000000) {
hts_log_error("CIGAR query length %"PRIhts_pos
" for read \"%s\" is too long",
qlen, bam_get_qname(b));
return -1;
}
c->s_num_bases += qlen;
}
c->curr_rec++;
c->curr_c_rec++;
c->s_aux_bytes += bam_get_l_aux(b);
c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
fd->record_counter++;
return 0;
}