use crate::utils::DEFAULT_FLANK_SIZE;
use clap::Parser;
use log::info;
use rust_htslib::bam::{IndexedReader, Read};
use std::collections::HashSet;
use std::fs::{read_to_string, File};
use std::io::{BufWriter, Write};
use std::path::Path;
#[derive(Parser, Debug)]
pub struct ExtractBamArgs {
pub regions_file: String,
pub bam_file: String,
#[clap(short, long, default_value_t = DEFAULT_FLANK_SIZE)]
pub flank_size: i32,
}
pub fn extract_bam(region_file: &str, bam_file: &str, flank_size: i32) {
info!("Extracting reads from BAM file");
let regions = read_to_string(region_file).expect("valid region file");
let regions: Vec<&str> = regions.lines().collect();
let mut bam = IndexedReader::from_path(bam_file).expect("valid BAM file");
let output_file = Path::new(region_file).with_extension("fasta");
let mut writer = BufWriter::new(File::create(&output_file).unwrap());
let mut seen = HashSet::new();
for row in regions {
let field = row.split('\t').next().expect("valid region");
let region = field.split(':').collect::<Vec<&str>>();
let chrom = region[0];
let region = region[1].split('-').collect::<Vec<&str>>();
let start = region[0].parse::<i32>().expect("valid start position");
let end = region[1].parse::<i32>().expect("valid end position");
let start = (start - flank_size).max(0);
let end = end + flank_size;
bam.fetch((chrom, start, end)).expect("valid region");
for record in bam.records() {
let record = record.expect("valid record");
if record.is_unmapped()
|| record.is_secondary()
|| record.is_quality_check_failed()
|| record.is_duplicate()
|| record.is_supplementary()
{
continue;
}
let id = String::from_utf8(record.qname().to_vec()).expect("valid read ID");
if seen.contains(&id) {
info!("Skipping duplicate read {}", id);
continue;
}
seen.insert(id.clone());
let seq = record.seq();
writeln!(
writer,
">{}\n{}",
id,
String::from_utf8(seq.as_bytes()).unwrap()
)
.expect("write to FASTA");
}
}
info!("Extracted reads written to `{}`", output_file.display());
}