qbix 0.0.2

Random access to BAM records by read name using a compact .qbi index
use crate::error::Result;
use crate::hts::{BamRecord, HtsFile};
use crate::index::{generate_index_filename, qname_hash64, BamMetadata, Index};

const BGZF_CACHE_SIZE: usize = 64 * 1024 * 1024;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GetOrder {
    Query,
    Bam,
}

struct Hit<'a> {
    readname: &'a str,
    file_offset: i64,
}

pub(crate) fn build_index(
    input_bam: &str,
    output_index: Option<&str>,
    verbose: bool,
    threads: usize,
) -> Result<()> {
    let bam =
        HtsFile::open(input_bam, "r").map_err(|_| format!("[qbix] could not open {input_bam}"))?;
    bam.set_threads(threads)?;
    let header = bam
        .read_header()
        .map_err(|_| format!("[qbix] could not read BAM header from {input_bam}"))?;
    let bam_metadata = BamMetadata::from_bam(input_bam, header.text_hash()?)?;
    let rec = BamRecord::new()?;
    let mut index = Index::new();
    let mut file_offset = bam
        .tell()
        .map_err(|_| format!("[qbix] {input_bam} is not a BGZF-compressed BAM file"))?;

    loop {
        let ret = bam.read_next(&header, &rec);
        if ret < -1 {
            return Err(format!(
                "[qbix] error while reading BAM records from {input_bam}"
            ));
        }
        if ret < 0 {
            break;
        }

        let readname = rec.qname()?;
        index.add(readname, file_offset)?;
        if verbose && (index.record_count() == 1 || index.record_count().is_multiple_of(100_000)) {
            let last = index.last_record()?.expect("record was just added");
            eprintln!(
                "[qbix-build] record {} [{} {}] {}",
                index.record_count(),
                last.qhash,
                last.file_offset,
                readname
            );
        }
        file_offset = bam.tell()?;
    }

    if verbose {
        eprintln!("[qbix-build] writing to disk...");
    }
    let out_fn = generate_index_filename(Some(input_bam), output_index)?;
    index.save(&out_fn, bam_metadata)?;
    if verbose {
        eprintln!(
            "[qbix-build] wrote index for {} records.",
            index.record_count()
        );
    }
    Ok(())
}

pub(crate) fn get_records(
    input_bam: &str,
    input_index: Option<&str>,
    readnames: &[String],
    threads: usize,
    order: GetOrder,
) -> Result<()> {
    let bam = HtsFile::open(input_bam, "r")
        .map_err(|_| format!("[qbix] could not open BAM file: {input_bam}"))?;
    bam.set_threads(threads)?;
    bam.set_bgzf_cache_size(BGZF_CACHE_SIZE)?;
    let header = bam
        .read_header()
        .map_err(|_| format!("[qbix] could not read BAM header: {input_bam}"))?;
    let bam_metadata = BamMetadata::from_bam(input_bam, header.text_hash()?)?;
    let index = Index::load(Some(input_bam), input_index, Some(bam_metadata))?;
    let out = HtsFile::open("-", "w")
        .map_err(|_| "[qbix] could not open stdout for SAM output".to_string())?;
    let rec = BamRecord::new()?;

    let mut hits = Vec::new();
    for readname in readnames {
        for idx in index.range_indices(readname)? {
            let record = index.record(idx)?;
            hits.push(Hit {
                readname,
                file_offset: record.file_offset,
            });
        }
    }
    if order == GetOrder::Bam {
        hits.sort_by_key(|hit| hit.file_offset);
    }

    for hit in hits {
        bam.read_record_at(&header, &rec, hit.file_offset)?;
        if rec.qname()? == hit.readname {
            out.write_record(&header, &rec)?;
        }
    }
    Ok(())
}

pub(crate) fn show_index(input_index: &str) -> Result<()> {
    let index = Index::load(None, Some(input_index), None)?;
    for idx in 0..index.record_count() {
        let record = index.record(idx)?;
        println!("{}\t{}", record.qhash, record.file_offset);
    }
    Ok(())
}

pub(crate) fn test_index(
    input_bam: &str,
    input_index: Option<&str>,
    threads: usize,
    verbose: bool,
) -> Result<()> {
    let bam = HtsFile::open(input_bam, "r")
        .map_err(|_| "[qbix-test] could not open BAM file".to_string())?;
    bam.set_threads(threads)
        .map_err(|e| e.replace("[qbix]", "[qbix-test]"))?;
    bam.set_bgzf_cache_size(BGZF_CACHE_SIZE)
        .map_err(|e| e.replace("[qbix]", "[qbix-test]"))?;
    let header = bam
        .read_header()
        .map_err(|_| "[qbix-test] could not read BAM header".to_string())?;
    let bam_metadata = BamMetadata::from_bam(input_bam, header.text_hash()?)
        .map_err(|e| e.replace("[qbix]", "[qbix-test]"))?;
    let index = Index::load(Some(input_bam), input_index, Some(bam_metadata))
        .map_err(|e| e.replace("[qbix]", "[qbix-test]"))?;
    let rec =
        BamRecord::new().map_err(|_| "[qbix-test] could not allocate BAM record".to_string())?;

    let mut checked = 0usize;
    for idx in 0..index.record_count() {
        let record = index.record(idx)?;
        bam.read_record_at(&header, &rec, record.file_offset)
            .map_err(|e| e.replace("[qbix]", "[qbix-test]"))?;
        let got = rec.qname()?;
        let got_hash = qname_hash64(got.as_bytes());
        if verbose {
            eprintln!("[qbix-test] {} {}", record.qhash, got_hash);
        }
        if got_hash != record.qhash {
            return Err(
                "[qbix-test] lookup returned a record with the wrong read name hash".to_string(),
            );
        }
        checked += 1;
        if !verbose && checked.is_multiple_of(1_000_000) {
            eprintln!("[qbix-test] checked {checked} records...");
        }
    }
    Ok(())
}