use crate::{
constants::{AUTOSOME_CONTIG_NAMES, OUT_SUFFIX_STATS},
distributions::{expected::EstimatedDistr, trim_distr},
extract::is_primary_read,
util::{SampleStats, Utf8String},
};
use clap::{arg, Parser};
use ndarray::{Array1, Axis};
use ndarray_ndimage::gaussian_filter1d;
use rand::{distributions::WeightedIndex, prelude::*};
use rand_xoshiro::Xoshiro256PlusPlus;
use rust_htslib::bam::{self, Read};
use std::{
io::Write,
path::{Path, PathBuf},
};
#[derive(Parser)]
pub struct StatsCommandArgs {
pub alignment: PathBuf,
#[arg(short = 'n', long, default_value_t = 100)]
pub num_regions: usize,
#[arg(short = 'l', long, default_value_t = 100000)]
pub region_length: u64,
#[arg(short = 'q', long, default_value_t = 60.0)]
pub min_depth_mapq: f64,
#[arg(short = 'm', long, default_value_t = 60)]
pub min_insert_mapq: u8,
#[arg(long = "dc", default_value_t = 0.99)]
pub depth_distr_cutoff: f64,
#[arg(long = "ds", default_value_t = 0.0)]
pub depth_distr_gaussian_sigma: f64,
#[arg(long = "dt", default_value_t = 3)]
pub depth_distr_gaussian_truncate: usize,
#[arg(long = "ic", default_value_t = 0.998)]
pub insert_size_distr_cutoff: f64,
#[arg(long = "is", default_value_t = 6.0)]
pub insert_size_distr_gaussian_sigma: f64,
#[arg(long = "it", default_value_t = 3)]
pub insert_size_distr_gaussian_truncate: usize,
#[arg(
short = 'i',
long,
value_delimiter = ',',
default_value = AUTOSOME_CONTIG_NAMES,
num_args = 1..,
)]
pub include_contigs: Vec<String>,
#[arg(short = 's', long, default_value_t = 0)]
pub seed: u64,
#[arg(long)]
pub no_plot: bool,
#[arg(short = '@', default_value_t = 0)]
pub htslib_read_threads: usize,
}
pub fn run_stats_command(args: &StatsCommandArgs, output_prefix: &Path) -> anyhow::Result<()> {
let mut rng = Xoshiro256PlusPlus::seed_from_u64(args.seed);
info!("Opening alignment file");
let mut aln_rdr = bam::IndexedReader::from_path(&args.alignment)?;
if args.htslib_read_threads > 0 {
aln_rdr.set_threads(args.htslib_read_threads)?;
}
let header = aln_rdr.header().clone();
let tids = args
.include_contigs
.iter()
.filter_map(|c| header.tid(c.as_bytes()))
.collect::<Vec<_>>();
let tid_lens = tids
.iter()
.map(|&tid| header.target_len(tid).unwrap())
.collect::<Vec<_>>();
info!(
"Found {}/{} of included contigs in the alignment file",
tids.len(),
args.include_contigs.len()
);
let regions = generate_random_regions(
&tids,
&tid_lens,
args.num_regions,
args.region_length,
&mut rng,
)?;
let mut read_length = 0;
let mut depth_distr = EstimatedDistr::new();
let mut insert_distr = EstimatedDistr::new();
info!("Estimating read depth distribution");
for (tid, start, end) in ®ions {
aln_rdr.fetch((*tid, *start, *end))?;
let mapq_sum: u64 = aln_rdr.records().map(|r| r.unwrap().mapq() as u64).sum();
let avg_mapq: f64 = mapq_sum as f64 / aln_rdr.records().count() as f64;
if avg_mapq >= args.min_depth_mapq {
aln_rdr.fetch((*tid, *start, *end))?;
let depths = aln_rdr
.pileup()
.map(|p| p.unwrap().depth())
.collect::<Vec<_>>();
depth_distr.add_many(&depths);
} else {
warn!(
"Region ({}, {}, {}) has average mapq below 60. Skipping...",
tid, start, end
);
}
}
info!("Estimating insert size distribution");
for (tid, start, end) in ®ions {
aln_rdr.fetch((*tid, *start, *end))?;
for record_result in aln_rdr.records() {
let record = record_result?;
let not_clipped = record.cigar().to_string() == format!("{}M", record.seq_len());
if is_primary_read(&record)
&& record.is_proper_pair()
&& not_clipped
&& record.mapq() >= args.min_insert_mapq
{
let insert_size = record.insert_size();
if insert_size == 0 {
warn!(
"Proper read pair ({}) has zero insert size",
record.qname().to_string()
);
}
insert_distr.add(insert_size.unsigned_abs() as u32);
}
read_length = read_length.max(record.seq().len());
}
}
if args.depth_distr_gaussian_sigma > 0.0 {
debug!("Smoothing depth distribution with Gaussian filter");
let depth_distr_arr = Array1::from_vec(depth_distr.into());
let depth_distr_arr = gaussian_filter1d(
&depth_distr_arr,
args.depth_distr_gaussian_sigma,
Axis(0),
0,
ndarray_ndimage::BorderMode::Nearest,
args.depth_distr_gaussian_truncate,
);
depth_distr = EstimatedDistr::from_vec(depth_distr_arr.to_vec());
}
if args.insert_size_distr_gaussian_sigma > 0.0 {
debug!("Smoothing insert size distribution with Gaussian filter");
let insert_distr_arr = Array1::from_vec(insert_distr.into());
let insert_distr_arr = gaussian_filter1d(
&insert_distr_arr,
args.insert_size_distr_gaussian_sigma,
Axis(0),
0,
ndarray_ndimage::BorderMode::Nearest,
args.insert_size_distr_gaussian_truncate,
);
insert_distr = EstimatedDistr::from_vec(insert_distr_arr.to_vec());
}
if args.depth_distr_cutoff < 1.0 {
debug!("Trimming depth distribution");
trim_distr(depth_distr.as_mut(), args.depth_distr_cutoff);
}
if args.insert_size_distr_cutoff < 1.0 {
debug!("Trimming insert size distribution");
trim_distr(insert_distr.as_mut(), args.insert_size_distr_cutoff);
}
depth_distr.norm();
insert_distr.norm();
let stats = SampleStats::new(
read_length as u64,
depth_distr.mean(),
depth_distr.std_dev(),
insert_distr.mean(),
insert_distr.std_dev(),
depth_distr.distr.clone(),
insert_distr.distr.clone(),
);
info!("Writing stats file");
let depth_out_path = output_prefix.with_extension(OUT_SUFFIX_STATS);
let mut depth_out_file = std::fs::File::create(depth_out_path)?;
depth_out_file.write_all(serde_json::to_string(&stats)?.as_bytes())?;
Ok(())
}
fn generate_random_regions(
tids: &[u32],
tid_lens: &[u64],
num_regions: usize,
region_len: u64,
rng: &mut Xoshiro256PlusPlus,
) -> anyhow::Result<Vec<(u32, i64, i64)>> {
let weight_idx = WeightedIndex::new(tid_lens)?;
let mut regions = Vec::new();
for _ in 0..num_regions {
let tid_idx = weight_idx.sample(rng);
let tid: u32 = tids[tid_idx];
let max_start = tid_lens[tid_idx] - region_len;
let start = rng.gen_range(0..max_start) as i64;
let end = start + region_len as i64;
regions.push((tid, start, end));
}
Ok(regions)
}