klassify 0.1.6

Classify chimeric reads based on unique kmer contents
Documentation
use crate::utils::DEFAULT_FLANK_SIZE;
use clap::Parser;
use log::info;
use rust_htslib::bam::{IndexedReader, Read};
use std::collections::HashSet;
use std::fs::{read_to_string, File};
use std::io::{BufWriter, Write};
use std::path::Path;

#[derive(Parser, Debug)]
pub struct ExtractBamArgs {
    /// Regions that contain the reads
    /// Format: "chr1:1-100", one region per line
    pub regions_file: String,
    /// BAM file to extract reads
    pub bam_file: String,
    /// Flank size to extract from the region
    #[clap(short, long, default_value_t = DEFAULT_FLANK_SIZE)]
    pub flank_size: i32,
}

/// Extract reads from a BAM file
pub fn extract_bam(region_file: &str, bam_file: &str, flank_size: i32) {
    info!("Extracting reads from BAM file");
    let regions = read_to_string(region_file).expect("valid region file");
    let regions: Vec<&str> = regions.lines().collect();
    let mut bam = IndexedReader::from_path(bam_file).expect("valid BAM file");
    let output_file = Path::new(region_file).with_extension("fasta");
    let mut writer = BufWriter::new(File::create(&output_file).unwrap());
    let mut seen = HashSet::new();
    for row in regions {
        // Get first tab-separated column as region
        let field = row.split('\t').next().expect("valid region");
        let region = field.split(':').collect::<Vec<&str>>();
        let chrom = region[0];
        let region = region[1].split('-').collect::<Vec<&str>>();
        let start = region[0].parse::<i32>().expect("valid start position");
        let end = region[1].parse::<i32>().expect("valid end position");
        let start = (start - flank_size).max(0);
        let end = end + flank_size;
        bam.fetch((chrom, start, end)).expect("valid region");
        for record in bam.records() {
            let record = record.expect("valid record");
            if record.is_unmapped()
                || record.is_secondary()
                || record.is_quality_check_failed()
                || record.is_duplicate()
                || record.is_supplementary()
            {
                continue;
            }
            let id = String::from_utf8(record.qname().to_vec()).expect("valid read ID");
            if seen.contains(&id) {
                info!("Skipping duplicate read {}", id);
                continue;
            }
            seen.insert(id.clone());
            let seq = record.seq();
            writeln!(
                writer,
                ">{}\n{}",
                id,
                String::from_utf8(seq.as_bytes()).unwrap()
            )
            .expect("write to FASTA");
        }
    }
    info!("Extracted reads written to `{}`", output_file.display());
}