#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <getopt.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include "htslib/tbx.h"
#include "htslib/sam.h"
#include "htslib/vcf.h"
#include "htslib/kseq.h"
#include "htslib/bgzf.h"
#include "htslib/hts.h"
#include "htslib/regidx.h"
typedef struct
{
char *regions_fname, *targets_fname;
int print_header, header_only;
}
args_t;
static void error(const char *format, ...)
{
va_list ap;
va_start(ap, format);
vfprintf(stderr, format, ap);
va_end(ap);
exit(EXIT_FAILURE);
}
#define IS_GFF (1<<0)
#define IS_BED (1<<1)
#define IS_SAM (1<<2)
#define IS_VCF (1<<3)
#define IS_BCF (1<<4)
#define IS_BAM (1<<5)
#define IS_CRAM (1<<6)
#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
int file_type(const char *fname)
{
int l = strlen(fname);
if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
htsFile *fp = hts_open(fname,"r");
enum htsExactFormat format = fp->format.format;
hts_close(fp);
if ( format == bcf ) return IS_BCF;
if ( format == bam ) return IS_BAM;
if ( format == cram ) return IS_CRAM;
if ( format == vcf ) return IS_VCF;
return 0;
}
static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
{
kstring_t str = {0,0,0};
int iseq = 0, ireg = 0;
char **regs = NULL;
*nregs = argc;
if ( regions_fname )
{
regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
if ( !idx ) error("Could not read %s\n", regions_fname);
(*nregs) += regidx_nregs(idx);
regs = (char**) malloc(sizeof(char*)*(*nregs));
int nseq;
char **seqs = regidx_seq_names(idx, &nseq);
for (iseq=0; iseq<nseq; iseq++)
{
regitr_t itr;
regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
while ( itr.i < itr.n )
{
str.l = 0;
ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
regs[ireg++] = strdup(str.s);
itr.i++;
}
}
regidx_destroy(idx);
}
free(str.s);
if ( !ireg )
{
if ( argc )
regs = (char**) malloc(sizeof(char*)*argc);
else
{
regs = (char**) malloc(sizeof(char*));
regs[0] = strdup(".");
*nregs = 1;
}
}
for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
return regs;
}
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
int i;
htsFile *fp = hts_open(fname,"r");
if ( !fp ) error("Could not read %s\n", fname);
enum htsExactFormat format = hts_get_format(fp)->format;
regidx_t *reg_idx = NULL;
if ( args->targets_fname )
{
reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
}
if ( format == bcf )
{
htsFile *out = hts_open("-","w");
if ( !out ) error("Could not open stdout\n", fname);
hts_idx_t *idx = bcf_index_load(fname);
if ( !idx ) error("Could not load .csi index of %s\n", fname);
bcf_hdr_t *hdr = bcf_hdr_read(fp);
if ( !hdr ) error("Could not read the header: %s\n", fname);
if ( args->print_header )
bcf_hdr_write(out,hdr);
if ( !args->header_only )
{
bcf1_t *rec = bcf_init();
for (i=0; i<nregs; i++)
{
hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
while ( bcf_itr_next(fp, itr, rec) >=0 )
{
if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
bcf_write(out,hdr,rec);
}
tbx_itr_destroy(itr);
}
bcf_destroy(rec);
}
if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
bcf_hdr_destroy(hdr);
hts_idx_destroy(idx);
}
else if ( format==vcf || format==sam || format==unknown_format )
{
tbx_t *tbx = tbx_index_load(fname);
if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
kstring_t str = {0,0,0};
if ( args->print_header )
{
while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
{
if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
puts(str.s);
}
}
if ( !args->header_only )
{
int nseq;
const char **seq = NULL;
if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
for (i=0; i<nregs; i++)
{
hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
if ( !itr ) continue;
while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
{
if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
puts(str.s);
}
tbx_itr_destroy(itr);
}
free(seq);
}
free(str.s);
tbx_destroy(tbx);
}
else if ( format==bam )
error("Please use \"samtools view\" for querying BAM files.\n");
if ( reg_idx ) regidx_destroy(reg_idx);
if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
for (i=0; i<nregs; i++) free(regs[i]);
free(regs);
return 0;
}
static int query_chroms(char *fname)
{
const char **seq;
int i, nseq, ftype = file_type(fname);
if ( ftype & IS_TXT || !ftype )
{
tbx_t *tbx = tbx_index_load(fname);
if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
seq = tbx_seqnames(tbx, &nseq);
for (i=0; i<nseq; i++)
printf("%s\n", seq[i]);
free(seq);
tbx_destroy(tbx);
}
else if ( ftype==IS_BCF )
{
htsFile *fp = hts_open(fname,"r");
if ( !fp ) error("Could not read %s\n", fname);
bcf_hdr_t *hdr = bcf_hdr_read(fp);
if ( !hdr ) error("Could not read the header: %s\n", fname);
hts_close(fp);
hts_idx_t *idx = bcf_index_load(fname);
if ( !idx ) error("Could not load .csi index of %s\n", fname);
seq = bcf_index_seqnames(idx, hdr, &nseq);
for (i=0; i<nseq; i++)
printf("%s\n", seq[i]);
free(seq);
bcf_hdr_destroy(hdr);
hts_idx_destroy(idx);
}
else if ( ftype==IS_BAM ) error("BAM: todo\n");
return 0;
}
int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
{
if ( ftype & IS_TXT || !ftype )
{
BGZF *fp = bgzf_open(fname,"r");
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
char *buffer = fp->uncompressed_block;
int skip_until = 0;
if ( buffer[0]==conf->meta_char )
{
skip_until = 1;
while (1)
{
if ( buffer[skip_until]=='\n' )
{
skip_until++;
if ( skip_until>=fp->block_length )
{
if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
skip_until = 0;
}
if ( buffer[skip_until]!=conf->meta_char ) break;
}
skip_until++;
if ( skip_until>=fp->block_length )
{
if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
skip_until = 0;
}
}
}
FILE *hdr = fopen(header,"r");
if ( !hdr ) error("%s: %s", header,strerror(errno));
const size_t page_size = 32768;
char *buf = malloc(page_size);
BGZF *bgzf_out = bgzf_open("-", "w");
ssize_t nread;
while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
{
if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
}
if ( fclose(hdr) ) error("close failed: %s\n", header);
if ( fp->block_length - skip_until > 0 )
{
if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
}
if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
while (1)
{
nread = bgzf_raw_read(fp, buf, page_size);
if ( nread<=0 ) break;
int count = bgzf_raw_write(bgzf_out, buf, nread);
if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
}
if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
free(buf);
}
else
error("todo: reheader BCF, BAM\n"); return 0;
}
static int usage(void)
{
fprintf(stderr, "\n");
fprintf(stderr, "Version: %s\n", hts_version());
fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Indexing Options:\n");
fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
fprintf(stderr, " -C, --csi generate CSI index for VCF (default is TBI)\n");
fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
fprintf(stderr, " -f, --force overwrite existing index without asking\n");
fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf\n");
fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Querying and other options:\n");
fprintf(stderr, " -h, --print-header print also the header lines\n");
fprintf(stderr, " -H, --only-header print only the header lines\n");
fprintf(stderr, " -l, --list-chroms list chromosome names\n");
fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
fprintf(stderr, " -R, --regions FILE restrict to regions listed in the file\n");
fprintf(stderr, " -T, --targets FILE similar to -R but streams rather than index-jumps\n");
fprintf(stderr, "\n");
return 1;
}
int main(int argc, char *argv[])
{
int c, detect = 1, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
tbx_conf_t conf = tbx_conf_gff;
char *reheader = NULL;
args_t args;
memset(&args,0,sizeof(args_t));
static const struct option loptions[] =
{
{"help", no_argument, NULL, 2},
{"regions", required_argument, NULL, 'R'},
{"targets", required_argument, NULL, 'T'},
{"csi", no_argument, NULL, 'C'},
{"zero-based", no_argument, NULL, '0'},
{"print-header", no_argument, NULL, 'h'},
{"only-header", no_argument, NULL, 'H'},
{"begin", required_argument, NULL, 'b'},
{"comment", required_argument, NULL, 'c'},
{"end", required_argument, NULL, 'e'},
{"force", no_argument, NULL, 'f'},
{"preset", required_argument, NULL, 'p'},
{"sequence", required_argument, NULL, 's'},
{"skip-lines", required_argument, NULL, 'S'},
{"list-chroms", no_argument, NULL, 'l'},
{"reheader", required_argument, NULL, 'r'},
{"version", no_argument, NULL, 1},
{NULL, 0, NULL, 0}
};
char *tmp;
while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
{
switch (c)
{
case 'R': args.regions_fname = optarg; break;
case 'T': args.targets_fname = optarg; break;
case 'C': do_csi = 1; break;
case 'r': reheader = optarg; break;
case 'h': args.print_header = 1; break;
case 'H': args.print_header = 1; args.header_only = 1; break;
case 'l': list_chroms = 1; break;
case '0': conf.preset |= TBX_UCSC; detect = 0; break;
case 'b':
conf.bc = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
detect = 0;
break;
case 'e':
conf.ec = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
detect = 0;
break;
case 'c': conf.meta_char = *optarg; detect = 0; break;
case 'f': is_force = 1; break;
case 'm':
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
break;
case 'p':
detect = 0;
if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
else if (strcmp(optarg, "bcf") == 0) detect = 1; else if (strcmp(optarg, "bam") == 0) detect = 1; else error("The preset string not recognised: '%s'\n", optarg);
break;
case 's':
conf.sc = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
detect = 0;
break;
case 'S':
conf.line_skip = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
detect = 0;
break;
case 1:
printf(
"tabix (htslib) %s\n"
"Copyright (C) 2018 Genome Research Ltd.\n", hts_version());
return EXIT_SUCCESS;
default: return usage();
}
}
if ( optind==argc ) return usage();
if ( list_chroms )
return query_chroms(argv[optind]);
if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
{
int nregs = 0;
char **regs = NULL;
if ( !args.header_only )
regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
return query_regions(&args, argv[optind], regs, nregs);
}
char *fname = argv[optind];
int ftype = file_type(fname);
if ( detect ) {
if ( ftype==IS_GFF ) conf = tbx_conf_gff;
else if ( ftype==IS_BED ) conf = tbx_conf_bed;
else if ( ftype==IS_SAM ) conf = tbx_conf_sam;
else if ( ftype==IS_VCF )
{
conf = tbx_conf_vcf;
if ( !min_shift && do_csi ) min_shift = 14;
}
else if ( ftype==IS_BCF )
{
if ( !min_shift ) min_shift = 14;
}
else if ( ftype==IS_BAM )
{
if ( !min_shift ) min_shift = 14;
}
}
if ( do_csi )
{
if ( !min_shift ) min_shift = 14;
min_shift *= do_csi; }
if ( min_shift!=0 && !do_csi ) do_csi = 1;
if ( reheader )
return reheader_file(fname, reheader, ftype, &conf);
char *suffix = ".tbi";
if ( do_csi ) suffix = ".csi";
else if ( ftype==IS_BAM ) suffix = ".bai";
else if ( ftype==IS_CRAM ) suffix = ".crai";
char *idx_fname = calloc(strlen(fname) + 5, 1);
strcat(strcpy(idx_fname, fname), suffix);
struct stat stat_tbi, stat_file;
if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
{
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
}
free(idx_fname);
if ( ftype==IS_CRAM )
{
if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
return 0;
}
else if ( do_csi )
{
if ( ftype==IS_BCF )
{
if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
return 0;
}
if ( ftype==IS_BAM )
{
if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
return 0;
}
if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
return 0;
}
else {
if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
return 0;
}
}