use rust_htslib::faidx;
use rust_htslib::{bam, bam::Read};
use std::collections::HashMap;
use std::fs::File;
use std::io::{prelude::*, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
fn count_base(c: u8) -> char {
match c {
65 => 'A',
97 => 'A',
85 => 'T',
117 => 'T',
67 => 'C',
99 => 'C',
71 => 'G',
103 => 'G',
84 => 'T',
116 => 'T',
_ => 'N',
}
}
pub fn run(bam_path: PathBuf, fasta_path: PathBuf, region_path: PathBuf, win: usize, step: usize) {
let file = File::open(®ion_path).unwrap();
let reader = BufReader::new(file);
let dna_bases = &vec!['A', 'C', 'G', 'T'];
let mut gc_background: HashMap<usize, usize> = HashMap::new();
let mut gc_counter: HashMap<usize, usize> = HashMap::new();
let fa_reader = &faidx::Reader::from_path(&fasta_path).unwrap();
let mut bam = bam::IndexedReader::from_path(&bam_path).unwrap();
for line in reader.lines() {
match line {
Ok(line) => {
let mut region = line.split('\t');
let chrom = region.next().unwrap();
let start = 0;
let end = usize::from_str(region.next().unwrap()).unwrap();
eprintln!("{}:{}-{}", chrom, start, end);
let mut span_start = start;
let mut span_end = start + win;
let mut span_index = 0;
while span_start < end {
span_index += 1;
if span_index % 10000 == 0 {
eprintln!(
"Finished {}M regions on genome",
span_index as f64 / (1000000.0 / 100.0 / 1000.0)
);
}
let fa_string = fa_reader
.fetch_seq(&chrom, span_start as usize, (span_end - 1) as usize)
.unwrap();
let base_counter = dna_bases
.clone()
.into_iter()
.map(|b| fa_string.iter().filter(|&c| count_base(*c) == b).count())
.collect::<Vec<_>>();
let total = base_counter.iter().sum::<usize>();
let gc = (base_counter[1] + base_counter[2]) as f64 / total as f64;
let gci = (gc * 1000.0).round() as usize;
gc_background
.entry(gci)
.and_modify(|e| *e += 1)
.or_insert(1);
if total > 0 {
bam.fetch((chrom, span_start as i64, span_end as i64))
.unwrap();
let n = bam.records().count();
if n > 0 {
gc_counter.entry(gci).and_modify(|e| *e += n).or_insert(n);
}
}
span_start += step;
span_end = span_start + win;
}
}
Err(..) => {}
}
}
for (k, v) in gc_counter {
println!("{}\t{}\t{}", k as f64 / 1000.0, v, gc_background[&k]);
}
}