use std::path::PathBuf;
use anyhow::Result;
use clap::Args;
use kuva::plot::LinePlot;
use kuva::render::layout::Layout;
use kuva::render::plots::Plot;
use noodles::sam::Header;
use noodles::sam::alignment::RecordBuf;
use riker_derive::MetricDocs;
use serde::{Deserialize, Serialize};
use crate::collector::Collector;
use crate::commands::command::Command;
use crate::commands::common::{InputOptions, OptionalReferenceOptions, OutputOptions};
use crate::counter::Counter;
use crate::metrics::write_tsv;
use crate::plotting::{FG_BLUE, FG_GREEN, FG_TEAL, PLOT_HEIGHT, PLOT_WIDTH, write_plot_pdf};
use crate::progress::ProgressLogger;
use crate::sam::alignment_reader::AlignmentReader;
use crate::sam::pair_orientation::{PairOrientation, get_pair_orientation};
use crate::sam::record_utils::derive_sample;
pub const METRICS_SUFFIX: &str = ".isize-metrics.txt";
pub const HISTOGRAM_SUFFIX: &str = ".isize-histogram.txt";
pub const PLOT_SUFFIX: &str = ".isize-histogram.pdf";
#[riker_derive::multi_options("isize", "Insert Size Options")]
#[derive(Args, Debug, Clone)]
#[command()]
pub struct IsizeOptions {
#[arg(long, default_value_t = false)]
pub include_duplicates: bool,
#[arg(long, default_value_t = IsizeOptions::DEFAULT_MIN_FRAC)]
pub min_frac: f64,
#[arg(long, default_value_t = IsizeOptions::DEFAULT_DEVIATIONS)]
pub deviations: f64,
}
impl IsizeOptions {
const DEFAULT_MIN_FRAC: f64 = 0.05;
const DEFAULT_DEVIATIONS: f64 = 10.0;
}
impl Default for IsizeOptions {
fn default() -> Self {
Self {
include_duplicates: false,
min_frac: Self::DEFAULT_MIN_FRAC,
deviations: Self::DEFAULT_DEVIATIONS,
}
}
}
#[derive(Args, Debug, Clone)]
#[command(
long_about,
after_long_help = "\
Examples:
riker isize -i input.bam -o out_prefix
riker isize -i input.bam -o out_prefix --include-duplicates"
)]
pub struct InsertSize {
#[command(flatten)]
pub input: InputOptions,
#[command(flatten)]
pub output: OutputOptions,
#[command(flatten)]
pub reference: OptionalReferenceOptions,
#[command(flatten)]
pub options: IsizeOptions,
}
impl Command for InsertSize {
fn execute(&self) -> Result<()> {
let (mut reader, header) =
AlignmentReader::new(&self.input.input, self.reference.reference.as_deref())?;
let mut collector =
InsertSizeCollector::new(&self.input.input, &self.output.output, &self.options);
collector.initialize(&header)?;
let mut progress = ProgressLogger::new("isize", "reads", 5_000_000);
for result in reader.record_bufs(&header) {
let record = result?;
progress.record_with(&record, &header);
collector.accept(&record, &header)?;
}
progress.finish();
collector.finish()
}
}
pub struct InsertSizeCollector {
input: PathBuf,
metrics_path: PathBuf,
histogram_path: PathBuf,
plot_path: PathBuf,
plot_title: String,
include_duplicates: bool,
min_frac: f64,
deviations: f64,
fr: Counter<u64>,
rf: Counter<u64>,
tandem: Counter<u64>,
}
impl InsertSizeCollector {
#[must_use]
pub fn new(input: &std::path::Path, prefix: &std::path::Path, options: &IsizeOptions) -> Self {
let metrics_path = super::command::output_path(prefix, METRICS_SUFFIX);
let histogram_path = super::command::output_path(prefix, HISTOGRAM_SUFFIX);
let plot_path = super::command::output_path(prefix, PLOT_SUFFIX);
Self {
input: input.to_path_buf(),
metrics_path,
histogram_path,
plot_path,
plot_title: String::new(),
include_duplicates: options.include_duplicates,
min_frac: options.min_frac,
deviations: options.deviations,
fr: Counter::new(),
rf: Counter::new(),
tandem: Counter::new(),
}
}
fn plot_histogram(&self) -> Result<()> {
let total = self.fr.total() + self.rf.total() + self.tandem.total();
let orientations: [(&str, &str, &Counter<u64>); 3] = [
("FR", FG_BLUE, &self.fr),
("RF", FG_GREEN, &self.rf),
("TANDEM", FG_TEAL, &self.tandem),
];
let mut plots: Vec<Plot> = Vec::new();
for (name, color, hist) in &orientations {
let count = hist.total();
if count == 0 || (total > 0 && below_min_frac(count, total, self.min_frac)) {
continue;
}
let (median, mad) = hist.median_and_mad();
let trim_max = median + self.deviations * mad;
let xy: Vec<(f64, f64)> = hist
.sorted()
.into_iter()
.take_while(|&(k, _)| k as f64 <= trim_max)
.map(|(x, y)| (x as f64, y as f64))
.collect();
plots.push(Plot::Line(
LinePlot::new()
.with_data(xy)
.with_color(*color)
.with_fill()
.with_fill_opacity(0.3)
.with_legend(*name),
));
}
if plots.is_empty() {
return Ok(());
}
let layout = Layout::auto_from_plots(&plots)
.with_width(PLOT_WIDTH)
.with_height(PLOT_HEIGHT)
.with_title(&self.plot_title)
.with_x_label("Insert Size (bp)")
.with_y_label("Read Pairs")
.with_minor_ticks(5)
.with_show_minor_grid(true);
write_plot_pdf(plots, layout, &self.plot_path)
}
}
impl Collector for InsertSizeCollector {
fn initialize(&mut self, header: &Header) -> Result<()> {
let label = derive_sample(&self.input, header);
self.plot_title = format!("Insert Size Distribution of {label}");
Ok(())
}
fn accept(&mut self, record: &RecordBuf, _header: &Header) -> Result<()> {
let flags = record.flags();
if flags.is_unmapped()
|| flags.is_mate_unmapped()
|| !flags.is_segmented()
|| flags.is_first_segment()
|| flags.is_secondary()
|| flags.is_supplementary()
|| flags.is_qc_fail()
|| (!self.include_duplicates && flags.is_duplicate())
|| record.template_length() == 0
|| record.reference_sequence_id() != record.mate_reference_sequence_id()
{
return Ok(());
}
let insert_size = u64::from(record.template_length().unsigned_abs());
let Some(orientation) = get_pair_orientation(record) else {
return Ok(());
};
match orientation {
PairOrientation::Fr => self.fr.count(insert_size),
PairOrientation::Rf => self.rf.count(insert_size),
PairOrientation::Tandem => self.tandem.count(insert_size),
}
Ok(())
}
fn finish(&mut self) -> Result<()> {
let total: u64 = self.fr.total() + self.rf.total() + self.tandem.total();
if total == 0 {
log::warn!("isize: no qualifying read pairs found; writing empty metrics file");
}
let orientations: [(&str, &Counter<u64>); 3] =
[("FR", &self.fr), ("RF", &self.rf), ("TANDEM", &self.tandem)];
let mut metrics: Vec<InsertSizeMetric> = Vec::new();
for (name, hist) in &orientations {
let count = hist.total();
if count == 0 {
continue;
}
if total > 0 && below_min_frac(count, total, self.min_frac) {
continue;
}
let (median, mad) = hist.median_and_mad();
let mode = hist.mode().unwrap_or(0);
let min_is = hist.min().expect("non-empty histogram");
let max_is = hist.max().expect("non-empty histogram");
let trim_max = median + self.deviations * mad;
let (mean, stddev) = hist.trimmed_mean_and_stddev(trim_max);
metrics.push(InsertSizeMetric {
pair_orientation: (*name).to_string(),
read_pairs: count,
mean_insert_size: mean,
standard_deviation: stddev,
median_insert_size: median,
median_absolute_deviation: mad,
mode_insert_size: mode,
min_insert_size: min_is,
max_insert_size: max_is,
});
}
write_tsv(&self.metrics_path, &metrics)?;
let all_keys: std::collections::BTreeSet<u64> =
self.fr.keys().chain(self.rf.keys()).chain(self.tandem.keys()).copied().collect();
let hist_entries: Vec<InsertSizeHistogramEntry> = all_keys
.into_iter()
.map(|k| InsertSizeHistogramEntry {
insert_size: k,
fr_count: self.fr.count_of(&k),
rf_count: self.rf.count_of(&k),
tandem_count: self.tandem.count_of(&k),
})
.collect();
write_tsv(&self.histogram_path, &hist_entries)?;
self.plot_histogram()?;
Ok(())
}
fn name(&self) -> &'static str {
"isize"
}
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct InsertSizeMetric {
pub pair_orientation: String,
pub read_pairs: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub mean_insert_size: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub standard_deviation: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub median_insert_size: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub median_absolute_deviation: f64,
pub mode_insert_size: u64,
pub min_insert_size: u64,
pub max_insert_size: u64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct InsertSizeHistogramEntry {
pub insert_size: u64,
pub fr_count: u64,
pub rf_count: u64,
pub tandem_count: u64,
}
pub(crate) fn below_min_frac(count: u64, total: u64, min_frac: f64) -> bool {
(count as f64) < (total as f64) * min_frac
}
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
use super::*;
#[test]
fn test_below_min_frac_below() {
assert!(below_min_frac(4, 100, 0.05));
}
#[test]
fn test_below_min_frac_at_threshold() {
assert!(!below_min_frac(5, 100, 0.05));
}
#[test]
fn test_below_min_frac_above() {
assert!(!below_min_frac(10, 100, 0.05));
}
#[test]
fn test_below_min_frac_zero_total() {
assert!(!below_min_frac(0, 0, 0.05));
}
}