use crate::commands::common::parse_bool;
use crate::logging::OperationTimer;
use crate::metrics::duplex::{DuplexMetricsCollector, DuplexYieldMetric};
use crate::simple_umi_consensus::SimpleUmiConsensusCaller;
use crate::umi::extract_mi_base;
use crate::validation::validate_file_exists;
use anyhow::{Context, Result};
use clap::Parser;
use fgoxide::io::DelimFile;
use log::info;
use statrs::distribution::{Binomial, DiscreteCDF};
use std::path::PathBuf;
use super::command::Command;
use super::shared_metrics::{
DOWNSAMPLING_FRACTIONS, TemplateInfo, TemplateMetadata, compute_template_metadata,
execute_r_script, is_r_available, parse_intervals, process_templates_from_bam,
validate_not_consensus_bam,
};
/// R plotting script bundled into the binary at compile time; executed (when R
/// is available) to render the `<output>.duplex_qc.pdf` file.
const R_SCRIPT: &str = include_str!("../../../resources/CollectDuplexSeqMetrics.R");
// NOTE(review): per-field comments below intentionally use `//` rather than `///` —
// clap's derive macro turns doc comments into CLI help text, so `///` here would
// change the `--help` output.
#[derive(Parser, Debug)]
#[command(
    name = "duplex-metrics",
    author,
    version,
    about = "\x1b[38;5;173m[POST-CONSENSUS]\x1b[0m \x1b[36mCollect QC metrics for duplex consensus reads\x1b[0m",
    long_about = r#"
Collects a suite of metrics to QC duplex sequencing data.
## Inputs
The input to this tool must be a BAM file that is either:
1. The exact BAM output by the `group` tool (in the sort-order it was produced in)
2. A BAM file that has MI tags present on all reads (usually set by `group` and has been
sorted into template-coordinate order
Calculation of metrics may be restricted to a set of regions using the `--intervals` parameter.
This can significantly affect results as off-target reads in duplex sequencing experiments often
have very different properties than on-target reads due to the lack of enrichment.
Several metrics are calculated related to the fraction of tag families that have duplex coverage.
The definition of "duplex" is controlled by the `--min-ab-reads` and `--min-ba-reads` parameters.
The default is to treat any tag family with at least one observation of each strand as a duplex,
but this could be made more stringent, e.g. by setting `--min-ab-reads=3 --min-ba-reads=3`.
## Outputs
The following output files are produced:
1. **<output>.family_sizes.txt**: metrics on the frequency of different types of families of different sizes
2. **<output>.duplex_family_sizes.txt**: metrics on the frequency of duplex tag families by the number of
observations from each strand
3. **<output>.duplex_yield_metrics.txt**: summary QC metrics produced using 5%, 10%, 15%...100% of the data
4. **<output>.umi_counts.txt**: metrics on the frequency of observations of UMIs within reads and tag families
5. **<output>.duplex_umi_counts.txt**: (optional) metrics on the frequency of observations of duplex UMIs within
reads and tag families. This file is only produced _if_ the
`--duplex-umi-counts` option is used as it requires significantly more
memory to track all pairs of UMIs seen when a large number of UMI
sequences are present.
6. **<output>.duplex_qc.pdf**: (optional) a series of plots generated from the preceding metrics files for
visualization. This file is only produced if R is available with the required
packages (ggplot2 and scales). Use `--description` to customize plot titles.
Within the metrics files the prefixes `CS`, `SS` and `DS` are used to mean:
* **CS**: tag families where membership is defined solely on matching genome coordinates and strand
* **SS**: single-stranded tag families where membership is defined by genome coordinates, strand and UMI;
ie. 50/A and 50/B are considered different tag families
* **DS**: double-stranded tag families where membership is collapsed across single-stranded tag families
from the same double-stranded source molecule; i.e. 50/A and 50/B become one family
"#
)]
pub struct DuplexMetrics {
    // Input BAM: either the exact output of `group`, or any BAM with MI tags
    // sorted into template-coordinate order (see long_about above).
    #[arg(short = 'i', long = "input")]
    pub input: PathBuf,
    // Prefix for all output files, e.g. `<output>.family_sizes.txt`.
    #[arg(short = 'o', long = "output")]
    pub output: PathBuf,
    // Minimum reads on the more-covered (AB) strand for a family to count as a
    // duplex. Must be >= min_ba_reads (validated in execute()).
    #[arg(long = "min-ab-reads", default_value = "1")]
    pub min_ab_reads: usize,
    // Minimum reads on the less-covered (BA) strand for a family to count as a duplex.
    #[arg(long = "min-ba-reads", default_value = "1")]
    pub min_ba_reads: usize,
    // When true, also track UMI pairs and write `<output>.duplex_umi_counts.txt`
    // (may use significantly more memory per the tool's long_about).
    #[arg(long = "duplex-umi-counts", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
    pub duplex_umi_counts: bool,
    // Optional intervals file restricting which templates are counted
    // (tests use BED-style `chrom\tstart\tend` lines; parsed by parse_intervals).
    #[arg(short = 'l', long = "intervals")]
    pub intervals: Option<PathBuf>,
    // Optional sample description used for plot titles; defaults to "Sample".
    #[arg(long = "description")]
    pub description: Option<String>,
}
impl Command for DuplexMetrics {
    /// Runs duplex metrics collection end-to-end: validates the input BAM,
    /// streams it once while collecting metrics at every downsampling fraction,
    /// writes the family-size / duplex-family-size / UMI / yield metrics files
    /// (plus duplex-UMI counts when requested), and finally attempts to render
    /// QC plots via the bundled R script.
    fn execute(&self, _command_line: &str) -> Result<()> {
        info!("DuplexMetrics");
        info!(" Input: {}", self.input.display());
        info!(" Output prefix: {}", self.output.display());
        info!(" Min AB reads: {}", self.min_ab_reads);
        info!(" Min BA reads: {}", self.min_ba_reads);
        info!(" Collect duplex UMI counts: {}", self.duplex_umi_counts);
        let timer = OperationTimer::new("Computing duplex metrics");
        validate_file_exists(&self.input, "input BAM file")?;
        // The duplex convention is AB = the more-covered strand, so the AB
        // minimum must be at least the BA minimum.
        if self.min_ab_reads < self.min_ba_reads {
            anyhow::bail!(
                "--min-ab-reads ({}) must be >= --min-ba-reads ({})",
                self.min_ab_reads,
                self.min_ba_reads
            );
        }
        validate_not_consensus_bam(&self.input)?;
        // Optional region restriction; an empty Vec means "no restriction".
        let intervals = if let Some(intervals_path) = &self.intervals {
            info!(" Loading intervals from: {}", intervals_path.display());
            let intervals = parse_intervals(intervals_path)?;
            info!(" Loaded {} intervals", intervals.len());
            intervals
        } else {
            Vec::new()
        };
        // One collector per downsampling fraction; only the collector for the
        // 100% fraction pays the extra memory cost of duplex UMI tracking.
        let fractions = &DOWNSAMPLING_FRACTIONS;
        let mut collectors: Vec<DuplexMetricsCollector> = fractions
            .iter()
            .map(|&fraction| {
                let collect_duplex = self.duplex_umi_counts && (fraction - 1.0_f64).abs() < 0.01;
                DuplexMetricsCollector::new(collect_duplex)
            })
            .collect();
        let mut umi_consensus_caller = SimpleUmiConsensusCaller::default();
        info!("Processing templates in single pass at {} sampling fractions...", fractions.len());
        // Single pass over the BAM: each coordinate group is handed to
        // process_coordinate_group, which updates every fraction's collector.
        let (total_template_count, fraction_template_counts) = process_templates_from_bam(
            &self.input,
            &intervals,
            fractions.len(),
            |group, fraction_counts| {
                Self::process_coordinate_group(
                    group,
                    fractions,
                    &mut collectors,
                    &mut umi_consensus_caller,
                    fraction_counts,
                    self,
                );
            },
        )?;
        info!("Processed {total_template_count} templates");
        // fractions, collectors and fraction_template_counts are parallel
        // arrays; build one yield metric row per fraction.
        let mut yield_metrics = Vec::new();
        for ((&fraction, collector), &read_pairs) in
            fractions.iter().zip(collectors.iter()).zip(fraction_template_counts.iter())
        {
            let yield_metric = self.generate_yield_metric(collector, fraction, read_pairs);
            yield_metrics.push(yield_metric);
        }
        // The last collector corresponds to the 100% fraction (per the expect
        // message); it backs all of the non-yield output files below.
        let main_collector =
            collectors.pop().expect("collectors is non-empty (always includes 100% fraction)");
        info!("Writing metrics...");
        let family_size_metrics = main_collector.family_size_metrics();
        let family_size_path = format!("{}.family_sizes.txt", self.output.display());
        DelimFile::default()
            .write_tsv(&family_size_path, family_size_metrics)
            .with_context(|| format!("Failed to write family size metrics: {family_size_path}"))?;
        info!("Wrote family size metrics to {family_size_path}");
        let duplex_family_size_metrics = main_collector.duplex_family_size_metrics();
        let duplex_family_size_path = format!("{}.duplex_family_sizes.txt", self.output.display());
        DelimFile::default()
            .write_tsv(&duplex_family_size_path, duplex_family_size_metrics)
            .with_context(|| {
                format!("Failed to write duplex family size metrics: {duplex_family_size_path}")
            })?;
        info!("Wrote duplex family size metrics to {duplex_family_size_path}");
        let umi_metrics = main_collector.umi_metrics();
        let umi_path = format!("{}.umi_counts.txt", self.output.display());
        DelimFile::default()
            .write_tsv(&umi_path, umi_metrics)
            .with_context(|| format!("Failed to write UMI metrics: {umi_path}"))?;
        info!("Wrote UMI metrics to {umi_path}");
        // Duplex UMI counts are opt-in (see --duplex-umi-counts).
        if self.duplex_umi_counts {
            let duplex_umi_metrics =
                main_collector.duplex_umi_metrics(&main_collector.umi_metrics());
            let duplex_umi_path = format!("{}.duplex_umi_counts.txt", self.output.display());
            DelimFile::default().write_tsv(&duplex_umi_path, duplex_umi_metrics).with_context(
                || format!("Failed to write duplex UMI metrics: {duplex_umi_path}"),
            )?;
            info!("Wrote duplex UMI metrics to {duplex_umi_path}");
        }
        let yield_path = format!("{}.duplex_yield_metrics.txt", self.output.display());
        DelimFile::default()
            .write_tsv(&yield_path, yield_metrics)
            .with_context(|| format!("Failed to write yield metrics: {yield_path}"))?;
        info!("Wrote yield metrics to {yield_path}");
        // PDF plotting is best-effort: failures are logged but never fail the run.
        let pdf_path = format!("{}.duplex_qc.pdf", self.output.display());
        if is_r_available() {
            let description = self.description.as_deref().unwrap_or("Sample");
            match execute_r_script(
                R_SCRIPT,
                &[
                    &family_size_path,
                    &duplex_family_size_path,
                    &yield_path,
                    &umi_path,
                    &pdf_path,
                    description,
                ],
                "fgumi_CollectDuplexSeqMetrics.R",
            ) {
                Ok(()) => info!("Generated PDF plots: {pdf_path}"),
                Err(e) => {
                    log::warn!("Failed to generate PDF plots: {e}. Continuing without plots.");
                    log::warn!(
                        "To enable PDF generation, ensure R is installed with ggplot2 and scales packages:"
                    );
                    log::warn!(" install.packages(c(\"ggplot2\", \"scales\"))");
                }
            }
        } else {
            log::warn!(
                "R or required packages (ggplot2, scales) not available. Skipping PDF generation."
            );
            log::warn!("To enable PDF generation, install R and required packages:");
            log::warn!(" install.packages(c(\"ggplot2\", \"scales\"))");
        }
        info!("Done!");
        timer.log_completion(total_template_count as u64);
        Ok(())
    }
}
impl DuplexMetrics {
    /// Processes one coordinate group (templates sharing genome coordinates and
    /// strand) at every downsampling fraction, recording CS, SS, DS and duplex
    /// family sizes; UMI metrics are updated only at the 100% fraction.
    fn process_coordinate_group(
        group: &[TemplateInfo],
        fractions: &[f64],
        collectors: &mut [DuplexMetricsCollector],
        umi_consensus_caller: &mut SimpleUmiConsensusCaller,
        fraction_template_counts: &mut [usize],
        metrics: &DuplexMetrics,
    ) {
        use std::collections::HashMap;
        if group.is_empty() {
            return;
        }
        let metadata = compute_template_metadata(group);
        for (idx, &fraction) in fractions.iter().enumerate() {
            // Deterministic downsampling: keep templates whose per-template
            // hash fraction falls at or below this sampling fraction.
            let downsampled: Vec<&TemplateMetadata> =
                metadata.iter().filter(|m| m.template.hash_fraction <= fraction).collect();
            if downsampled.is_empty() {
                continue;
            }
            fraction_template_counts[idx] += downsampled.len();
            // CS family: the entire coordinate group is one family.
            collectors[idx].record_cs_family(downsampled.len());
            // SS families: keyed by the full MI tag, so 50/A and 50/B are
            // distinct single-strand families.
            let mut ss_groups: HashMap<&str, usize> = HashMap::new();
            for m in &downsampled {
                *ss_groups.entry(m.template.mi.as_str()).or_default() += 1;
            }
            for &ss_size in ss_groups.values() {
                collectors[idx].record_ss_family(ss_size);
            }
            // DS families: keyed by the base UMI (MI without strand suffix);
            // value = (A-strand count, B-strand count, (MI, RX) pairs seen).
            #[allow(clippy::type_complexity)]
            let mut ds_groups: HashMap<&str, (usize, usize, Vec<(&str, &str)>)> = HashMap::new();
            for m in &downsampled {
                let entry = ds_groups.entry(m.base_umi).or_default();
                if m.is_a_strand {
                    entry.0 += 1;
                } else if m.is_b_strand {
                    entry.1 += 1;
                }
                entry.2.push((m.template.mi.as_str(), m.template.rx.as_str()));
            }
            let is_full_fraction = (fraction - 1.0_f64).abs() < 0.01;
            for (base_umi, (a_count, b_count, mi_rx_pairs)) in &ds_groups {
                let ds_size = a_count + b_count;
                collectors[idx].record_ds_family(ds_size);
                // Duplex convention: AB is whichever strand has more reads.
                let (ab_count, ba_count) =
                    if a_count >= b_count { (*a_count, *b_count) } else { (*b_count, *a_count) };
                collectors[idx].record_duplex_family(ab_count, ba_count);
                // UMI metrics are collected only once, at the full fraction.
                if is_full_fraction {
                    let owned_pairs: Vec<(String, String)> = mi_rx_pairs
                        .iter()
                        .map(|(mi, rx)| ((*mi).to_string(), (*rx).to_string()))
                        .collect();
                    metrics.update_umi_metrics(
                        &mut collectors[idx],
                        umi_consensus_caller,
                        &owned_pairs,
                        base_umi,
                        *a_count,
                        *b_count,
                    );
                }
            }
        }
    }
    /// Builds the yield metric row for a single downsampling fraction from the
    /// corresponding collector's family-size data.
    fn generate_yield_metric(
        &self,
        collector: &DuplexMetricsCollector,
        fraction: f64,
        read_pairs: usize,
    ) -> DuplexYieldMetric {
        let family_size_metrics = collector.family_size_metrics();
        let ds_families_count: usize = family_size_metrics.iter().map(|m| m.ds_count).sum();
        let duplex_family_size_metrics = collector.duplex_family_size_metrics();
        // A DS family is a duplex when both strands meet the configured minimums.
        let ds_duplexes_count: usize = duplex_family_size_metrics
            .iter()
            .filter(|m| m.ab_size >= self.min_ab_reads && m.ba_size >= self.min_ba_reads)
            .map(|m| m.count)
            .sum();
        // Expand (family_size, ds_count) rows into one size entry per DS family.
        let ds_family_sizes: Vec<usize> =
            family_size_metrics.iter().flat_map(|m| vec![m.family_size; m.ds_count]).collect();
        let ideal_fraction = Self::calculate_ideal_duplex_fraction(
            &ds_family_sizes,
            self.min_ab_reads,
            self.min_ba_reads,
        );
        let cs_families: usize = family_size_metrics.iter().map(|m| m.cs_count).sum();
        // Each duplex family contributes one SS family per strand with >0 reads.
        let ss_families: usize = duplex_family_size_metrics
            .iter()
            .map(|m| {
                let mut count = 0;
                if m.ab_size > 0 {
                    count += 1;
                }
                if m.ba_size > 0 {
                    count += 1;
                }
                count * m.count
            })
            .sum();
        DuplexYieldMetric {
            fraction,
            read_pairs,
            cs_families,
            ss_families,
            ds_families: ds_families_count,
            ds_duplexes: ds_duplexes_count,
            ds_fraction_duplexes: if ds_families_count > 0 {
                ds_duplexes_count as f64 / ds_families_count as f64
            } else {
                0.0
            },
            ds_fraction_duplexes_ideal: ideal_fraction,
        }
    }
    /// Estimates the expected fraction of DS families that would qualify as
    /// duplexes if each family's reads split between strands as Binomial(size, 0.5):
    /// for each family it adds P(min_ab <= X <= size - min_ba) and averages.
    fn calculate_ideal_duplex_fraction(
        family_sizes: &[usize],
        min_ab: usize,
        min_ba: usize,
    ) -> f64 {
        if family_sizes.is_empty() {
            return 0.0;
        }
        let mut ideal_duplexes = 0.0;
        let total_families = family_sizes.len() as f64;
        for &size in family_sizes {
            // A family smaller than min_ab + min_ba can never satisfy both
            // minimums simultaneously (this also guards the subtraction below).
            if size < min_ab + min_ba {
                continue;
            }
            let binomial = match Binomial::new(0.5, size as u64) {
                Ok(b) => b,
                Err(_) => continue,
            };
            // P(lower_bound <= X <= upper_bound) via a CDF difference.
            let upper_bound = size - min_ba;
            let lower_bound = min_ab;
            let prob = if upper_bound >= lower_bound {
                let p_upper = binomial.cdf(upper_bound as u64);
                let p_lower =
                    if lower_bound > 0 { binomial.cdf((lower_bound - 1) as u64) } else { 0.0 };
                p_upper - p_lower
            } else {
                0.0
            };
            ideal_duplexes += prob;
        }
        ideal_duplexes / total_families
    }
    /// Records single-strand (and optionally duplex) UMI metrics for one DS
    /// family. RX values from /B reads are swapped so that both strands are
    /// expressed in A-strand orientation before consensus calling.
    fn update_umi_metrics(
        &self,
        collector: &mut DuplexMetricsCollector,
        umi_consensus_caller: &mut SimpleUmiConsensusCaller,
        group_pairs: &[(String, String)],
        base_umi: &str,
        _a_count: usize,
        _b_count: usize,
    ) {
        let mut umi1s = Vec::new();
        let mut umi2s = Vec::new();
        for (mi, rx) in group_pairs {
            // Only consider reads that belong to this DS family.
            let mi_base = extract_mi_base(mi);
            if mi_base != base_umi {
                continue;
            }
            // RX is expected as "<umi1>-<umi2>"; skip malformed or empty halves.
            let parts: Vec<&str> = rx.split('-').collect();
            if parts.len() != 2 {
                continue;
            }
            if parts[0].is_empty() || parts[1].is_empty() {
                continue;
            }
            if mi.ends_with("/A") {
                umi1s.push(parts[0].to_string());
                umi2s.push(parts[1].to_string());
            } else if mi.ends_with("/B") {
                // B-strand reads observe the two UMIs in the opposite order.
                umi1s.push(parts[1].to_string());
                umi2s.push(parts[0].to_string());
            }
        }
        let mut consensus_umis = Vec::new();
        if !umi1s.is_empty() {
            let (consensus, _had_errors) = umi_consensus_caller.consensus(&umi1s);
            let raw_count = umi1s.len();
            // Raw observations that disagree with the consensus count as errors.
            let error_count = umi1s.iter().filter(|u| **u != consensus).count();
            collector.record_umi(&consensus, raw_count, error_count, true);
            consensus_umis.push(consensus);
        }
        if !umi2s.is_empty() {
            let (consensus, _had_errors) = umi_consensus_caller.consensus(&umi2s);
            let raw_count = umi2s.len();
            let error_count = umi2s.iter().filter(|u| **u != consensus).count();
            collector.record_umi(&consensus, raw_count, error_count, true);
            consensus_umis.push(consensus);
        }
        // Duplex UMI metrics: only when both strand consensuses exist and the
        // user opted in (tracking all UMI pairs can be memory-hungry).
        if self.duplex_umi_counts && consensus_umis.len() == 2 {
            let duplex_umi = format!("{}-{}", consensus_umis[0], consensus_umis[1]);
            let total_raw = umi1s.len();
            // A raw pair counts as an error if its RX matches neither strand
            // orientation of the consensus duplex UMI.
            let expected_duplex1 = format!("{}-{}", consensus_umis[0], consensus_umis[1]);
            let expected_duplex2 = format!("{}-{}", consensus_umis[1], consensus_umis[0]);
            let error_count = group_pairs
                .iter()
                .filter(|(mi, rx)| {
                    let mi_base = extract_mi_base(mi);
                    mi_base == base_umi && *rx != expected_duplex1 && *rx != expected_duplex2
                })
                .count();
            collector.record_duplex_umi(&duplex_umi, total_raw, error_count, true);
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::commands::shared_metrics::{
Interval, TemplateInfo, compute_hash_fraction, overlaps_intervals, parse_intervals,
};
use crate::metrics::duplex::{DuplexFamilySizeMetric, DuplexUmiMetric, FamilySizeMetric};
use crate::metrics::shared::UmiMetric;
use anyhow::Result;
use fgoxide::io::DelimFile;
use fgumi_raw_bam::{
SamBuilder as RawSamBuilder, flags, raw_record_to_record_buf, testutil::encode_op,
};
use noodles::bam;
use noodles::sam;
use noodles::sam::alignment::io::Write;
use std::num::NonZeroUsize;
use tempfile::{NamedTempFile, TempDir};
/// Builds a two-contig (chr1, chr2) SAM header for tests, using the standard
/// GRCh38 lengths for those chromosomes.
fn create_test_header() -> sam::Header {
    use noodles::sam::header::record::value::Map;
    use noodles::sam::header::record::value::map::ReferenceSequence;
    // (name, length) for each reference sequence to register.
    let chromosomes = [("chr1", 248_956_422_usize), ("chr2", 242_193_529_usize)];
    let mut builder = sam::Header::builder();
    for (name, length) in chromosomes {
        builder = builder.add_reference_sequence(
            bstr::BString::from(name),
            Map::<ReferenceSequence>::new(
                NonZeroUsize::new(length).expect("non-zero chromosome length"),
            ),
        );
    }
    builder.build()
}
/// Converts a raw BAM record into a `RecordBuf` against an empty header,
/// panicking on failure (test-only helper).
fn to_record_buf(raw: fgumi_raw_bam::RawRecord) -> sam::alignment::RecordBuf {
    let empty_header = sam::Header::default();
    let converted = raw_record_to_record_buf(&raw, &empty_header);
    converted.expect("raw_record_to_record_buf failed in test")
}
/// Builds a mapped, properly paired R1/R2 record pair sharing a name, RX UMI
/// and MI tag. Positions are 1-based (converted to 0-based for the builder);
/// `strand1_plus`/`strand2_plus` control the REVERSE/MATE_REVERSE flags.
#[allow(clippy::too_many_arguments, clippy::cast_sign_loss)]
fn build_test_pair(
    name: &str,
    ref_id: usize,
    pos1: i32,
    pos2: i32,
    rx_umi: &str,
    mi_tag: &str,
    strand1_plus: bool,
    strand2_plus: bool,
) -> (sam::alignment::RecordBuf, sam::alignment::RecordBuf) {
    // Shared read attributes: 100 bp all-A sequence at Q30, single 100M op.
    let seq = vec![b'A'; 100];
    let quals = vec![30u8; 100];
    let cigar = encode_op(0, 100);
    // Builds one mate of the pair at `pos` whose mate sits at `mate_pos`.
    let make_end = |segment_flag, reverse: bool, mate_reverse: bool, pos: i32, mate_pos: i32| {
        let mut record_flags = flags::PAIRED | segment_flag;
        if reverse {
            record_flags |= flags::REVERSE;
        }
        if mate_reverse {
            record_flags |= flags::MATE_REVERSE;
        }
        let mut builder = RawSamBuilder::new();
        builder
            .read_name(name.as_bytes())
            .flags(record_flags)
            .ref_id(ref_id as i32)
            .pos(pos - 1)
            .mapq(60)
            .cigar_ops(&[cigar])
            .sequence(&seq)
            .qualities(&quals)
            .mate_ref_id(ref_id as i32)
            .mate_pos(mate_pos - 1);
        builder.add_string_tag(b"RX", rx_umi.as_bytes());
        builder.add_string_tag(b"MI", mi_tag.as_bytes());
        to_record_buf(builder.build())
    };
    let r1 = make_end(flags::FIRST_SEGMENT, !strand1_plus, !strand2_plus, pos1, pos2);
    let r2 = make_end(flags::LAST_SEGMENT, !strand2_plus, !strand1_plus, pos2, pos1);
    (r1, r2)
}
/// Writes the given records to a fresh temporary BAM file using the standard
/// two-contig test header, returning the temp file handle (keeps it alive).
fn create_test_bam(records: Vec<sam::alignment::RecordBuf>) -> Result<NamedTempFile> {
    let temp_file = NamedTempFile::new()?;
    let header = create_test_header();
    {
        // Scope the writer so it is flushed/closed before the file is read back.
        let mut writer = bam::io::writer::Builder.build_from_path(temp_file.path())?;
        writer.write_header(&header)?;
        for record in &records {
            writer.write_alignment_record(&header, record)?;
        }
    }
    Ok(temp_file)
}
/// One DS family observed once from each strand: each UMI should be counted
/// once per read pair (2 raw observations total), not once per read, and no
/// duplex UMI file should be written when the flag is off.
#[test]
fn test_count_umis_once_per_read_pair() -> Result<()> {
    let mut records = Vec::new();
    // The same source molecule seen from the A strand and the B strand
    // (RX halves swapped on the B read, matching B-strand orientation).
    let (r1, r2) = build_test_pair("q1", 0, 100, 200, "AAA-TTT", "AAA-TTT/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("q2", 0, 200, 100, "TTT-AAA", "AAA-TTT/B", false, true);
    records.push(r1);
    records.push(r2);
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let umi_path = format!("{}.umi_counts.txt", output.display());
    let umi_metrics: Vec<UmiMetric> = DelimFile::default().read_tsv(&umi_path)?;
    assert_eq!(umi_metrics.len(), 2, "Should have 2 UMI entries");
    for metric in &umi_metrics {
        assert_eq!(metric.raw_observations, 2, "Each UMI should have 2 raw observations");
        assert_eq!(
            metric.unique_observations, 1,
            "Each UMI should have 1 unique observation (one DS family)"
        );
    }
    // duplex_umi_counts was false, so the duplex UMI file must not exist.
    let duplex_umi_path = format!("{}.duplex_umi_counts.txt", output.display());
    assert!(!std::path::Path::new(&duplex_umi_path).exists());
    Ok(())
}
/// One duplex family with one sequencing error per strand (CAA vs AAA, CTT vs
/// TTT): UMIs should be error-corrected via consensus before counting, and the
/// duplex UMI file should report a single corrected AAA-TTT entry.
#[test]
fn test_error_correct_umis_for_counting() -> Result<()> {
    let mut records = Vec::new();
    // Three A-strand pairs; the third carries a single-base UMI error.
    for i in 1..=3 {
        let umi = if i == 3 { "CAA-TTT" } else { "AAA-TTT" };
        let (r1, r2) = build_test_pair(&format!("a{i}"), 0, 100, 200, umi, "1/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    // Three B-strand pairs; the third carries a single-base UMI error.
    for i in 1..=3 {
        let umi = if i == 3 { "CTT-AAA" } else { "TTT-AAA" };
        let (r1, r2) = build_test_pair(&format!("b{i}"), 0, 200, 100, umi, "1/B", false, true);
        records.push(r1);
        records.push(r2);
    }
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: true,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let umi_path = format!("{}.umi_counts.txt", output.display());
    let umi_metrics: Vec<UmiMetric> = DelimFile::default().read_tsv(&umi_path)?;
    assert_eq!(umi_metrics.len(), 2, "Should have 2 UMIs after error correction");
    for metric in &umi_metrics {
        assert!(metric.umi == "AAA" || metric.umi == "TTT", "UMI should be AAA or TTT");
        assert_eq!(metric.raw_observations, 6, "Each UMI should have 6 total raw observations");
        assert_eq!(
            metric.raw_observations_with_errors, 1,
            "Each UMI should have 1 observation with error"
        );
        assert_eq!(metric.unique_observations, 1, "Each UMI should form 1 unique DS family");
    }
    // With duplex_umi_counts=true the duplex UMI file is written and the
    // consensus pair collapses to a single AAA-TTT entry.
    let duplex_umi_path = format!("{}.duplex_umi_counts.txt", output.display());
    let duplex_umi_metrics: Vec<DuplexUmiMetric> =
        DelimFile::default().read_tsv(&duplex_umi_path)?;
    assert_eq!(duplex_umi_metrics.len(), 1, "Should have 1 duplex UMI");
    let duplex_metric = &duplex_umi_metrics[0];
    assert_eq!(duplex_metric.umi, "AAA-TTT");
    assert_eq!(duplex_metric.raw_observations, 6);
    assert_eq!(duplex_metric.unique_observations, 1);
    Ok(())
}
/// Multiple tag families sharing UMI sequences: raw observations accumulate
/// per read pair across families, while unique observations count one per DS
/// family that used the UMI.
#[test]
fn test_count_unique_umi_observations_with_tag_families() -> Result<()> {
    let mut records = Vec::new();
    // Family 1: one A-strand and one B-strand pair (AAA/TTT in both orientations).
    let (r1, r2) = build_test_pair("q1", 0, 100, 200, "AAA-TTT", "1/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("q2", 0, 200, 100, "TTT-AAA", "1/B", false, true);
    records.push(r1);
    records.push(r2);
    // Family 2: three A-strand pairs reusing TTT/AAA; the third has an N error.
    for i in 1..=3 {
        let umi = if i == 3 { "NTT-AAA" } else { "TTT-AAA" };
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 2), 0, 150, 250, umi, "2/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    // Family 3: a single B-strand pair with distinct UMIs (CCC/GGG).
    let (r1, r2) = build_test_pair("q6", 0, 250, 350, "CCC-GGG", "3/B", false, true);
    records.push(r1);
    records.push(r2);
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let umi_path = format!("{}.umi_counts.txt", output.display());
    let umi_metrics: Vec<UmiMetric> = DelimFile::default().read_tsv(&umi_path)?;
    assert_eq!(umi_metrics.len(), 4, "Should have 4 unique UMIs");
    let aaa = umi_metrics.iter().find(|m| m.umi == "AAA").expect("Should find AAA");
    assert_eq!(aaa.raw_observations, 5, "AAA should have 5 raw observations");
    assert_eq!(aaa.raw_observations_with_errors, 0, "AAA should have no errors");
    assert_eq!(aaa.unique_observations, 2, "AAA should have 2 unique families");
    let ttt = umi_metrics.iter().find(|m| m.umi == "TTT").expect("Should find TTT");
    assert_eq!(ttt.raw_observations, 5, "TTT should have 5 raw observations");
    assert_eq!(ttt.raw_observations_with_errors, 1, "TTT should have 1 error (NTT)");
    assert_eq!(ttt.unique_observations, 2, "TTT should have 2 unique families");
    let ccc = umi_metrics.iter().find(|m| m.umi == "CCC").expect("Should find CCC");
    assert_eq!(ccc.raw_observations, 1);
    assert_eq!(ccc.unique_observations, 1);
    let ggg = umi_metrics.iter().find(|m| m.umi == "GGG").expect("Should find GGG");
    assert_eq!(ggg.raw_observations, 1);
    assert_eq!(ggg.unique_observations, 1);
    Ok(())
}
/// Exercises the CS/SS/DS size histograms: builds families of varying sizes at
/// several coordinate positions and checks the per-size counts in
/// `<output>.family_sizes.txt`.
#[test]
fn test_generate_accurate_family_size_counts() -> Result<()> {
    let mut records = Vec::new();
    // Position 100/200: family 1 with two A-strand pairs, family 2 with one,
    // family 3 with a single B-strand pair.
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("a{i}"), 0, 100, 200, "AAA-TTT", "1/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    let (r1, r2) = build_test_pair("b1", 0, 100, 200, "ACG-GGA", "2/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("c1", 0, 200, 100, "TAT-CGT", "3/B", false, true);
    records.push(r1);
    records.push(r2);
    // Position 200/300: families 4 and 5, each with one pair per strand.
    let (r1, r2) = build_test_pair("d1", 0, 200, 300, "TTT-AAA", "4/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("d2", 0, 300, 200, "AAA-AAA", "4/B", false, true);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("e1", 0, 200, 300, "CCC-GGG", "5/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("e2", 0, 300, 200, "GGG-CCC", "5/B", false, true);
    records.push(r1);
    records.push(r2);
    // Position 400/500: family 6 (one A pair), family 7 (two A pairs + one B pair).
    let (r1, r2) = build_test_pair("f1", 0, 400, 500, "GCG-GAA", "6/A", true, false);
    records.push(r1);
    records.push(r2);
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("g{i}"), 0, 400, 500, "ACG-CCT", "7/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    let (r1, r2) = build_test_pair("g3", 0, 500, 400, "CCT-ACG", "7/B", false, true);
    records.push(r1);
    records.push(r2);
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let family_path = format!("{}.family_sizes.txt", output.display());
    let family_metrics: Vec<FamilySizeMetric> = DelimFile::default().read_tsv(&family_path)?;
    let one = family_metrics.iter().find(|m| m.family_size == 1).expect("Should have size 1");
    assert_eq!(one.cs_count, 0);
    assert_eq!(one.ss_count, 8, "Should have 8 single-strand families of size 1");
    assert_eq!(one.ds_count, 3, "Should have 3 duplex families of size 1");
    let two = family_metrics.iter().find(|m| m.family_size == 2).expect("Should have size 2");
    assert_eq!(two.cs_count, 0);
    assert_eq!(two.ss_count, 2, "Should have 2 single-strand families of size 2");
    assert_eq!(two.ds_count, 3, "Should have 3 duplex families of size 2");
    let three = family_metrics.iter().find(|m| m.family_size == 3).expect("Should have size 3");
    assert_eq!(three.cs_count, 0);
    assert_eq!(three.ss_count, 0);
    assert_eq!(three.ds_count, 1, "Should have 1 duplex family of size 3");
    let four = family_metrics.iter().find(|m| m.family_size == 4).expect("Should have size 4");
    assert_eq!(four.cs_count, 3, "Should have 3 coordinate families of size 4");
    assert_eq!(four.ss_count, 0);
    assert_eq!(four.ds_count, 0);
    Ok(())
}
/// Builds DS families with strand splits of 1/0, 1/1, 2/1, 4/3 and 5/5 and
/// checks that `<output>.duplex_family_sizes.txt` reports each combination
/// once, always with the larger strand count first (ab_size >= ba_size).
#[test]
fn test_generate_accurate_duplex_family_size_counts() -> Result<()> {
    let mut records = Vec::new();
    // Family 1: A strand only (1/0).
    let (r1, r2) = build_test_pair("q1", 0, 100, 200, "AAA-ACG", "1/A", true, false);
    records.push(r1);
    records.push(r2);
    // Family 2: one pair on each strand (1/1).
    let (r1, r2) = build_test_pair("q2", 0, 200, 300, "AAA-ACG", "2/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("q3", 0, 300, 200, "ACG-AAA", "2/B", false, true);
    records.push(r1);
    records.push(r2);
    // Family 3: two B pairs and one A pair → 2/1 after normalization.
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 3), 0, 400, 300, "AAC-GGG", "3/B", false, true);
        records.push(r1);
        records.push(r2);
    }
    let (r1, r2) = build_test_pair("q6", 0, 300, 400, "GGG-AAC", "3/A", true, false);
    records.push(r1);
    records.push(r2);
    // Family 4: four A pairs and three B pairs (4/3).
    for i in 1..=4 {
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 6), 0, 400, 500, "GGG-AAC", "4/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    for i in 1..=3 {
        let (r1, r2) = build_test_pair(
            &format!("q{}", i + 10),
            0,
            500,
            400,
            "AAC-GGG",
            "4/B",
            false,
            true,
        );
        records.push(r1);
        records.push(r2);
    }
    // Family 6: five pairs on each strand (5/5).
    for i in 1..=5 {
        let (r1, r2) = build_test_pair(
            &format!("q{}", i + 13),
            0,
            600,
            700,
            "AGT-GCT",
            "6/A",
            true,
            false,
        );
        records.push(r1);
        records.push(r2);
    }
    for i in 1..=5 {
        let (r1, r2) = build_test_pair(
            &format!("q{}", i + 18),
            0,
            700,
            600,
            "GCT-AGT",
            "6/B",
            false,
            true,
        );
        records.push(r1);
        records.push(r2);
    }
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let duplex_family_path = format!("{}.duplex_family_sizes.txt", output.display());
    let duplex_metrics: Vec<DuplexFamilySizeMetric> =
        DelimFile::default().read_tsv(&duplex_family_path)?;
    // Rows must always report the larger strand count as ab_size.
    for metric in &duplex_metrics {
        assert!(metric.ab_size >= metric.ba_size, "ab_size should be >= ba_size");
    }
    assert_eq!(
        duplex_metrics.iter().find(|m| m.ab_size == 1 && m.ba_size == 0).map(|m| m.count),
        Some(1)
    );
    assert_eq!(
        duplex_metrics.iter().find(|m| m.ab_size == 1 && m.ba_size == 1).map(|m| m.count),
        Some(1)
    );
    assert_eq!(
        duplex_metrics.iter().find(|m| m.ab_size == 2 && m.ba_size == 1).map(|m| m.count),
        Some(1)
    );
    assert_eq!(
        duplex_metrics.iter().find(|m| m.ab_size == 4 && m.ba_size == 3).map(|m| m.count),
        Some(1)
    );
    assert_eq!(
        duplex_metrics.iter().find(|m| m.ab_size == 5 && m.ba_size == 5).map(|m| m.count),
        Some(1)
    );
    Ok(())
}
/// Builds three families with strand splits 1/1, 2/1 and 2/2 and verifies the
/// ds_duplexes yield metric at the 100% fraction under three different
/// min-ab/min-ba thresholds (1/1 → 3 duplexes, 2/1 → 2, 2/2 → 1).
#[test]
fn test_respect_min_ab_and_min_ba_for_counting_duplexes() -> Result<()> {
    let mut records = Vec::new();
    // Family 1: one pair per strand (1/1).
    let (r1, r2) = build_test_pair("q1", 0, 300, 400, "AAA-GGG", "1/A", true, false);
    records.push(r1);
    records.push(r2);
    let (r1, r2) = build_test_pair("q2", 0, 400, 300, "GGG-AAA", "1/B", false, true);
    records.push(r1);
    records.push(r2);
    // Family 2: one A pair, two B pairs (2/1 after normalization).
    let (r1, r2) = build_test_pair("q3", 0, 300, 400, "ACT-TTA", "2/A", true, false);
    records.push(r1);
    records.push(r2);
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 3), 0, 400, 300, "TTA-ACT", "2/B", false, true);
        records.push(r1);
        records.push(r2);
    }
    // Family 3: two pairs per strand (2/2).
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 5), 0, 300, 400, "CGA-GGT", "3/A", true, false);
        records.push(r1);
        records.push(r2);
    }
    for i in 1..=2 {
        let (r1, r2) =
            build_test_pair(&format!("q{}", i + 7), 0, 400, 300, "GGT-CGA", "3/B", false, true);
        records.push(r1);
        records.push(r2);
    }
    let input = create_test_bam(records)?;
    // Case 1: min 1/1 — all three families qualify.
    {
        let output_dir = TempDir::new()?;
        let output = output_dir.path().join("output");
        let cmd = DuplexMetrics {
            input: input.path().to_path_buf(),
            output: output.clone(),
            min_ab_reads: 1,
            min_ba_reads: 1,
            duplex_umi_counts: false,
            intervals: None,
            description: None,
        };
        cmd.execute("test")?;
        let yield_path = format!("{}.duplex_yield_metrics.txt", output.display());
        let yield_metrics: Vec<DuplexYieldMetric> =
            DelimFile::default().read_tsv(&yield_path)?;
        let full_metric = yield_metrics
            .iter()
            .find(|m| (m.fraction - 1.0).abs() < 0.01)
            .expect("Should have 100% fraction");
        assert_eq!(full_metric.ds_duplexes, 3, "Should have 3 duplexes with min 1/1");
    }
    // Case 2: min 2/1 — the 1/1 family drops out.
    {
        let output_dir = TempDir::new()?;
        let output = output_dir.path().join("output");
        let cmd = DuplexMetrics {
            input: input.path().to_path_buf(),
            output: output.clone(),
            min_ab_reads: 2,
            min_ba_reads: 1,
            duplex_umi_counts: false,
            intervals: None,
            description: None,
        };
        cmd.execute("test")?;
        let yield_path = format!("{}.duplex_yield_metrics.txt", output.display());
        let yield_metrics: Vec<DuplexYieldMetric> =
            DelimFile::default().read_tsv(&yield_path)?;
        let full_metric = yield_metrics
            .iter()
            .find(|m| (m.fraction - 1.0).abs() < 0.01)
            .expect("Should have 100% fraction");
        assert_eq!(full_metric.ds_duplexes, 2, "Should have 2 duplexes with min 2/1");
    }
    // Case 3: min 2/2 — only the 2/2 family remains.
    {
        let output_dir = TempDir::new()?;
        let output = output_dir.path().join("output");
        let cmd = DuplexMetrics {
            input: input.path().to_path_buf(),
            output: output.clone(),
            min_ab_reads: 2,
            min_ba_reads: 2,
            duplex_umi_counts: false,
            intervals: None,
            description: None,
        };
        cmd.execute("test")?;
        let yield_path = format!("{}.duplex_yield_metrics.txt", output.display());
        let yield_metrics: Vec<DuplexYieldMetric> =
            DelimFile::default().read_tsv(&yield_path)?;
        let full_metric = yield_metrics
            .iter()
            .find(|m| (m.fraction - 1.0).abs() < 0.01)
            .expect("Should have 100% fraction");
        assert_eq!(full_metric.ds_duplexes, 1, "Should have 1 duplex with min 2/2");
    }
    Ok(())
}
#[test]
fn test_only_count_inserts_overlapping_intervals() -> Result<()> {
    // Build six tag families of sizes 1..=6 spread over two reference sequences,
    // then restrict metrics with a BED file and check which families survive.
    let mut records = Vec::new();
    {
        // Pushes `n` read pairs for one family; query names continue from `first_q`
        // so they match the original q1..q21 numbering exactly.
        let mut add_family = |n, first_q: usize, tid, start, end, rx, mi| {
            for offset in 0..n {
                let (r1, r2) = build_test_pair(
                    &format!("q{}", first_q + offset),
                    tid,
                    start,
                    end,
                    rx,
                    mi,
                    true,
                    false,
                );
                records.push(r1);
                records.push(r2);
            }
        };
        add_family(1, 1, 0, 1000, 1100, "AAA-GGG", "1/A");
        add_family(2, 2, 0, 2000, 2100, "GGG-AAA", "2/A");
        add_family(3, 4, 0, 3000, 3100, "ACT-TTA", "3/A");
        add_family(4, 7, 1, 4000, 4100, "TTA-ACT", "4/A");
        add_family(5, 11, 1, 5000, 5100, "CGA-GGT", "5/A");
        add_family(6, 16, 1, 6000, 6100, "GGT-CGA", "6/A");
    }
    let input = create_test_bam(records)?;
    // BED intervals (0-based, half-open) covering families 1 and 3 on chr1 and
    // families 5 and 6 on chr2; families 2 and 4 fall outside every window.
    let intervals_dir = TempDir::new()?;
    let bed = intervals_dir.path().join("intervals.bed");
    std::fs::write(&bed, "chr1\t900\t1001\nchr1\t3150\t3500\nchr2\t5050\t6050\n")?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: Some(bed),
        description: None,
    };
    cmd.execute("test")?;
    let duplex_family_path = format!("{}.duplex_family_sizes.txt", output.display());
    let duplex_metrics: Vec<DuplexFamilySizeMetric> =
        DelimFile::default().read_tsv(&duplex_family_path)?;
    // Count of AB-only families at a given AB size (0 when no such row exists).
    let count_at = |ab| {
        duplex_metrics.iter().find(|m| m.ab_size == ab && m.ba_size == 0).map_or(0, |m| m.count)
    };
    assert_eq!(count_at(1), 1, "Should have 1 family of size 1");
    assert_eq!(count_at(3), 1, "Should have 1 family of size 3");
    assert_eq!(count_at(5), 1, "Should have 1 family of size 5");
    assert_eq!(count_at(6), 1, "Should have 1 family of size 6");
    Ok(())
}
#[test]
fn test_downsampling_generates_multiple_fractions() -> Result<()> {
    // 50 distinct single-pair families, each at its own position with its own
    // MI group, so downsampling has plenty of templates to draw from.
    let mut records = Vec::new();
    for idx in 1..=50 {
        let (r1, r2) = build_test_pair(
            &format!("q{idx}"),
            0,
            idx * 100,
            idx * 100 + 100,
            "AAA-TTT",
            &format!("{idx}/A"),
            true,
            false,
        );
        records.extend([r1, r2]);
    }
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    let yield_path = format!("{}.duplex_yield_metrics.txt", output.display());
    let yield_metrics: Vec<DuplexYieldMetric> = DelimFile::default().read_tsv(&yield_path)?;
    assert_eq!(yield_metrics.len(), 20, "Should have 20 sampling fractions");
    // Rows must be ordered by strictly increasing fraction, and the CS family
    // count must grow monotonically with the sampled fraction.
    for pair in yield_metrics.windows(2) {
        assert!(pair[1].fraction > pair[0].fraction, "Fractions should be increasing");
        assert!(
            pair[1].cs_families >= pair[0].cs_families,
            "CS families should be increasing or equal"
        );
    }
    Ok(())
}
#[test]
fn test_duplex_umi_expected_fraction_calculation() -> Result<()> {
    use crate::metrics::duplex::DuplexMetricsCollector;
    let mut collector = DuplexMetricsCollector::new(true);
    {
        // Records `n` single-read families for the duplex pair `ab` made of the
        // single-strand UMIs `a` and `b`.
        let mut record_families = |a, b, ab, n| {
            for _ in 0..n {
                collector.record_umi(a, 1, 0, true);
                collector.record_umi(b, 1, 0, true);
                collector.record_duplex_umi(ab, 1, 0, true);
            }
        };
        record_families("AAA", "TTT", "AAA-TTT", 5);
        record_families("GGG", "CCC", "GGG-CCC", 2);
        record_families("GGG", "AAA", "GGG-AAA", 1);
        record_families("TTT", "CCC", "TTT-CCC", 2);
    }
    let umi_metrics = collector.umi_metrics();
    let umi_map: std::collections::HashMap<_, _> =
        umi_metrics.iter().map(|m| (m.umi.as_str(), m)).collect();
    assert_eq!(umi_map["AAA"].unique_observations, 6, "AAA should appear in 6 families");
    assert_eq!(umi_map["TTT"].unique_observations, 7, "TTT should appear in 7 families");
    assert_eq!(umi_map["GGG"].unique_observations, 3, "GGG should appear in 3 families");
    assert_eq!(umi_map["CCC"].unique_observations, 4, "CCC should appear in 4 families");
    // 10 families x 2 single-strand UMIs each = 20 total observations.
    let (e_aaa, e_ttt, e_ggg, e_ccc) = (6.0 / 20.0, 7.0 / 20.0, 3.0 / 20.0, 4.0 / 20.0);
    for (umi, want) in [("AAA", e_aaa), ("TTT", e_ttt), ("GGG", e_ggg), ("CCC", e_ccc)] {
        assert!((umi_map[umi].fraction_unique_observations - want).abs() < 0.001);
    }
    // The expected fraction of a duplex UMI is the product of the observed
    // fractions of its two single-strand components.
    let duplex_metrics = collector.duplex_umi_metrics(&umi_metrics);
    let duplex_map: std::collections::HashMap<_, _> =
        duplex_metrics.iter().map(|m| (m.umi.as_str(), m)).collect();
    for (pair, want) in [
        ("AAA-TTT", e_aaa * e_ttt),
        ("GGG-CCC", e_ggg * e_ccc),
        ("GGG-AAA", e_ggg * e_aaa),
        ("TTT-CCC", e_ttt * e_ccc),
    ] {
        let got = duplex_map[pair].fraction_unique_observations_expected;
        assert!(
            (got - want).abs() < 0.001,
            "Expected {pair} fraction to be {want}, got {got}"
        );
    }
    Ok(())
}
#[test]
fn test_reject_consensus_bam() -> Result<()> {
    use noodles::sam::header::record::value::Map;
    use noodles::sam::header::record::value::map::ReferenceSequence;
    use std::num::NonZeroUsize;
    use tempfile::NamedTempFile;
    // A single mapped read carrying a `cD` tag makes the input look like
    // consensus output, which this command must refuse to process.
    let temp_bam = NamedTempFile::new()?;
    let bam_path = temp_bam.path().to_path_buf();
    let header = noodles::sam::Header::builder()
        .add_reference_sequence(
            "chr1",
            Map::<ReferenceSequence>::new(
                NonZeroUsize::new(1000).expect("non-zero chromosome length"),
            ),
        )
        .build();
    let mut builder = RawSamBuilder::new();
    builder
        .read_name(b"read1")
        .flags(flags::PAIRED | flags::FIRST_SEGMENT)
        .ref_id(0)
        .pos(99)
        .mapq(60)
        .cigar_ops(&[encode_op(0, 100)])
        .sequence(&[b'A'; 100])
        .qualities(&[30u8; 100]);
    builder.add_int_tag(b"cD", 10_i32);
    let rec = to_record_buf(builder.build());
    {
        // Scope the writer so the BAM is flushed and closed before execution.
        let mut writer = noodles::bam::io::Writer::new(std::fs::File::create(&bam_path)?);
        writer.write_header(&header)?;
        writer.write_alignment_record(&header, &rec)?;
        writer.try_finish()?;
    }
    let temp_output = tempfile::tempdir()?;
    let cmd = DuplexMetrics {
        input: bam_path,
        output: temp_output.path().join("metrics"),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    let err = cmd.execute("test").expect_err("consensus input must be rejected").to_string();
    assert!(
        err.contains("appears to contain consensus sequences"),
        "Error message should mention consensus sequences, got: {err}"
    );
    Ok(())
}
#[test]
fn test_default_duplex_metrics_parameters() {
    // Construct the command with its baseline settings (1/1 strand minimums,
    // no UMI counts, no interval restriction, no description) and verify that
    // every field round-trips unchanged.
    let cmd = DuplexMetrics {
        input: "input.bam".into(),
        output: "output".into(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    assert_eq!(cmd.min_ab_reads, 1);
    assert_eq!(cmd.min_ba_reads, 1);
    assert!(!cmd.duplex_umi_counts);
    assert!(cmd.intervals.is_none());
    assert!(cmd.description.is_none());
}
#[test]
fn test_duplex_metrics_with_custom_thresholds() {
    // Verify that every non-default field survives construction. The
    // `description` field was previously populated but never asserted;
    // check it as well so a regression there is caught.
    let metrics = DuplexMetrics {
        input: PathBuf::from("input.bam"),
        output: PathBuf::from("output"),
        min_ab_reads: 3,
        min_ba_reads: 2,
        duplex_umi_counts: true,
        intervals: Some(PathBuf::from("intervals.bed")),
        description: Some("Test Sample".to_string()),
    };
    assert_eq!(metrics.min_ab_reads, 3);
    assert_eq!(metrics.min_ba_reads, 2);
    assert!(metrics.duplex_umi_counts);
    assert_eq!(metrics.intervals, Some(PathBuf::from("intervals.bed")));
    assert_eq!(metrics.description.as_deref(), Some("Test Sample"));
}
#[test]
fn test_duplex_metrics_validation_fails_wrong_order() {
    // NOTE(review): this test only documents the precondition (ab < ba) that
    // validation is expected to reject; it never invokes the validation path
    // itself -- confirm that rejection is covered elsewhere.
    let metrics = DuplexMetrics {
        input: PathBuf::from("input.bam"),
        output: PathBuf::from("output"),
        min_ab_reads: 1,
        min_ba_reads: 2,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    assert!(metrics.min_ab_reads < metrics.min_ba_reads);
}
#[test]
fn test_hash_fraction_deterministic() {
    // The same read name must always map to the same fraction, and the
    // fraction must lie in the unit interval.
    let first = compute_hash_fraction("read1");
    let second = compute_hash_fraction("read1");
    assert!((first - second).abs() < f64::EPSILON);
    assert!((0.0..=1.0).contains(&first));
}
#[test]
fn test_hash_fraction_different_reads() {
    // All fractions stay within [0, 1]; at least two of the three distinct
    // names should hash differently (one collision is tolerated to keep the
    // test robust against an unlucky hash).
    let fractions: Vec<f64> = ["read1", "read2", "read3"]
        .iter()
        .map(|&name| compute_hash_fraction(name))
        .collect();
    for fraction in &fractions {
        assert!((0.0..=1.0).contains(fraction));
    }
    assert!(
        (fractions[0] - fractions[1]).abs() > f64::EPSILON
            || (fractions[1] - fractions[2]).abs() > f64::EPSILON
    );
}
#[test]
fn test_interval_overlap_logic() {
    // A mapped template at chr1:1000-1100.
    let mapped = TemplateInfo {
        mi: "1/A".to_string(),
        rx: "AAA-TTT".to_string(),
        ref_name: Some("chr1".to_string()),
        position: Some(1000),
        end_position: Some(1100),
        hash_fraction: 0.5,
    };
    // An empty interval set means no restriction: the template passes.
    assert!(overlaps_intervals(&mapped, &[]));
    // A window on the same contig that intersects the template.
    let hit = [Interval { ref_name: "chr1".to_string(), start: 900, end: 1050 }];
    assert!(overlaps_intervals(&mapped, &hit));
    // A window on the same contig that is disjoint from the template.
    let miss = [Interval { ref_name: "chr1".to_string(), start: 2000, end: 3000 }];
    assert!(!overlaps_intervals(&mapped, &miss));
    // An unmapped template never overlaps a non-empty interval set.
    let unmapped = TemplateInfo {
        mi: "2/A".to_string(),
        rx: "AAA-TTT".to_string(),
        ref_name: None,
        position: None,
        end_position: None,
        hash_fraction: 0.5,
    };
    assert!(!overlaps_intervals(&unmapped, &miss));
}
#[test]
fn test_parse_intervals_bed_format() -> Result<()> {
    // BED is 0-based half-open; comment lines must be skipped by the parser.
    let dir = TempDir::new()?;
    let bed = dir.path().join("intervals.bed");
    std::fs::write(&bed, "# comment line\nchr1\t100\t200\nchr2\t300\t400\n")?;
    let intervals = parse_intervals(&bed)?;
    assert_eq!(intervals.len(), 2);
    let expected = [("chr1", 100, 200), ("chr2", 300, 400)];
    for (interval, (name, start, end)) in intervals.iter().zip(expected) {
        assert_eq!(interval.ref_name, name);
        assert_eq!(interval.start, start);
        assert_eq!(interval.end, end);
    }
    Ok(())
}
#[test]
fn test_parse_intervals_interval_list_format() -> Result<()> {
    // Picard-style interval lists are 1-based inclusive; the parser converts
    // starts to 0-based, so 101 -> 100 and 301 -> 300 below.
    let dir = TempDir::new()?;
    let path = dir.path().join("intervals.interval_list");
    std::fs::write(
        &path,
        "@HD\tVN:1.6\tSO:coordinate\n\
         @SQ\tSN:chr1\tLN:10000\n\
         @SQ\tSN:chr2\tLN:10000\n\
         chr1\t101\t200\t+\tinterval1\n\
         chr2\t301\t400\t+\tinterval2\n",
    )?;
    let intervals = parse_intervals(&path)?;
    assert_eq!(intervals.len(), 2);
    let expected = [("chr1", 100, 200), ("chr2", 300, 400)];
    for (interval, (name, start, end)) in intervals.iter().zip(expected) {
        assert_eq!(interval.ref_name, name);
        assert_eq!(interval.start, start);
        assert_eq!(interval.end, end);
    }
    Ok(())
}
#[test]
fn test_only_count_inserts_overlapping_interval_list() -> Result<()> {
    // Same family layout as the BED-based test, but the restriction is given
    // as a Picard-style interval list (1-based inclusive coordinates).
    let mut records = Vec::new();
    {
        // Pushes `n` read pairs for one family; query names continue from
        // `first_q` so they match the original q1..q21 numbering exactly.
        let mut add_family = |n, first_q: usize, tid, start, end, rx, mi| {
            for offset in 0..n {
                let (r1, r2) = build_test_pair(
                    &format!("q{}", first_q + offset),
                    tid,
                    start,
                    end,
                    rx,
                    mi,
                    true,
                    false,
                );
                records.push(r1);
                records.push(r2);
            }
        };
        add_family(1, 1, 0, 1000, 1100, "AAA-GGG", "1/A");
        add_family(2, 2, 0, 2000, 2100, "GGG-AAA", "2/A");
        add_family(3, 4, 0, 3000, 3100, "ACT-TTA", "3/A");
        add_family(4, 7, 1, 4000, 4100, "TTA-ACT", "4/A");
        add_family(5, 11, 1, 5000, 5100, "CGA-GGT", "5/A");
        add_family(6, 16, 1, 6000, 6100, "GGT-CGA", "6/A");
    }
    let input = create_test_bam(records)?;
    let intervals_dir = TempDir::new()?;
    let intervals_path = intervals_dir.path().join("intervals.interval_list");
    std::fs::write(
        &intervals_path,
        "@HD\tVN:1.6\tSO:coordinate\n\
         @SQ\tSN:chr1\tLN:100000\n\
         @SQ\tSN:chr2\tLN:100000\n\
         chr1\t901\t1001\t+\tinterval1\n\
         chr1\t3151\t3500\t+\tinterval2\n\
         chr2\t5051\t6050\t+\tinterval3\n",
    )?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: Some(intervals_path),
        description: None,
    };
    cmd.execute("test")?;
    let duplex_family_path = format!("{}.duplex_family_sizes.txt", output.display());
    let duplex_metrics: Vec<DuplexFamilySizeMetric> =
        DelimFile::default().read_tsv(&duplex_family_path)?;
    // Count of AB-only families at a given AB size (0 when no such row exists).
    let count_at = |ab| {
        duplex_metrics.iter().find(|m| m.ab_size == ab && m.ba_size == 0).map_or(0, |m| m.count)
    };
    assert_eq!(count_at(1), 1, "Should have 1 family of size 1");
    assert_eq!(count_at(3), 1, "Should have 1 family of size 3");
    assert_eq!(count_at(5), 1, "Should have 1 family of size 5");
    assert_eq!(count_at(6), 1, "Should have 1 family of size 6");
    Ok(())
}
#[test]
fn test_handle_empty_umi_components() -> Result<()> {
    // Mix of well-formed duplex UMIs (q1/q2 and q6/q7 form two real duplexes)
    // and malformed ones: a missing half ("-CCC", "AAA-") and no separator
    // at all ("GGG").
    let pairs = [
        ("q1", 100, 200, "AAA-TTT", "1/A", true, false),
        ("q2", 200, 100, "TTT-AAA", "1/B", false, true),
        ("q3", 200, 300, "-CCC", "2/A", true, false),
        ("q4", 300, 400, "AAA-", "3/A", true, false),
        ("q5", 400, 500, "GGG", "4/A", true, false),
        ("q6", 500, 600, "CCC-GGG", "5/A", true, false),
        ("q7", 600, 500, "GGG-CCC", "5/B", false, true),
    ];
    let mut records = Vec::new();
    for (name, start, end, rx, mi, f1, f2) in pairs {
        let (r1, r2) = build_test_pair(name, 0, start, end, rx, mi, f1, f2);
        records.push(r1);
        records.push(r2);
    }
    let input = create_test_bam(records)?;
    let output_dir = TempDir::new()?;
    let output = output_dir.path().join("output");
    let cmd = DuplexMetrics {
        input: input.path().to_path_buf(),
        output: output.clone(),
        min_ab_reads: 1,
        min_ba_reads: 1,
        duplex_umi_counts: false,
        intervals: None,
        description: None,
    };
    cmd.execute("test")?;
    // Only the four syntactically valid single-strand UMIs are reported.
    let umi_path = format!("{}.umi_counts.txt", output.display());
    let umi_metrics: Vec<UmiMetric> = DelimFile::default().read_tsv(&umi_path)?;
    assert_eq!(umi_metrics.len(), 4, "Should have 4 valid UMIs");
    for metric in &umi_metrics {
        assert!(
            ["AAA", "TTT", "CCC", "GGG"].contains(&metric.umi.as_str()),
            "Unexpected UMI: {}",
            metric.umi
        );
    }
    // Families with malformed UMIs still count toward single-strand totals.
    let family_path = format!("{}.family_sizes.txt", output.display());
    let family_metrics: Vec<FamilySizeMetric> = DelimFile::default().read_tsv(&family_path)?;
    let cs_families: usize = family_metrics.iter().map(|m| m.cs_count).sum();
    assert_eq!(cs_families, 5, "Should have 5 CS families (including those with invalid UMIs)");
    // But only the two well-formed families can form duplexes.
    let duplex_path = format!("{}.duplex_family_sizes.txt", output.display());
    let duplex_metrics: Vec<DuplexFamilySizeMetric> =
        DelimFile::default().read_tsv(&duplex_path)?;
    let duplex_count: usize = duplex_metrics
        .iter()
        .filter(|m| m.ab_size >= 1 && m.ba_size >= 1)
        .map(|m| m.count)
        .sum();
    assert_eq!(duplex_count, 2, "Should have 2 valid duplex families");
    Ok(())
}
}