use anyhow::{bail, ensure, Context};
use derive_builder::Builder;
use log::debug;
use rust_lib_reference_genome::reference_genome::ReferenceGenome;
use std::collections::BTreeMap;
use crate::data_types::compare_benchmark::{CompareBenchmark, SequenceBundle};
use crate::data_types::compare_region::CompareRegion;
use crate::data_types::coordinates::Coordinates;
use crate::data_types::phase_enums::{Allele, Haplotype, PhasedZygosity};
use crate::data_types::summary_metrics::SummaryMetrics;
use crate::data_types::variants::{Variant, VariantType};
use crate::exact_gt_optimizer::optimize_gt_alleles;
use crate::parsing::stratifications::Stratifications;
use crate::query_optimizer::{optimize_sequences, OptimizedHaplotypes};
use crate::util::sequence_alignment::wfa_ed;
const SUPPORTED_VARIANT_TYPES: [VariantType; 8] = [
VariantType::Snv,
VariantType::Insertion,
VariantType::Deletion,
VariantType::Indel,
VariantType::TrContraction,
VariantType::TrExpansion,
VariantType::SvDeletion,
VariantType::SvInsertion
];
#[derive(Builder, Clone, Copy, Debug)]
#[builder(default)]
pub struct CompareConfig {
enable_sequences: bool,
enable_exact_shortcut: bool,
max_branch_factor: usize,
}
impl Default for CompareConfig {
fn default() -> Self {
Self {
enable_sequences: true,
enable_exact_shortcut: false,
max_branch_factor: 50,
}
}
}
pub fn solve_compare_region(
problem: &CompareRegion, reference_genome: &ReferenceGenome, compare_config: CompareConfig, stratifications: Option<&Stratifications>
) -> anyhow::Result<CompareBenchmark> {
let problem_id = problem.region_id();
let coordinates = problem.coordinates();
debug!("B#{problem_id} Scanning region #{problem_id}: {coordinates}");
let reference = reference_genome.get_full_chromosome(coordinates.chrom()); let truth_variants = problem.truth_variants();
let raw_truth_zygosity = problem.truth_zygosity();
let query_variants = problem.query_variants();
let raw_query_zygosity = problem.query_zygosity();
let ref_start = coordinates.start() as usize;
let ref_end = coordinates.end() as usize;
let ref_seq = &reference[ref_start..ref_end];
let all_optimized_haplotypes = optimize_sequences(
reference, coordinates,
truth_variants, raw_truth_zygosity,
query_variants, raw_query_zygosity,
compare_config.max_branch_factor
)?;
debug!("B#{problem_id} Found {} equal solutions, scoring by ALT flips...", all_optimized_haplotypes.len());
let containment_regions = if let Some(strat) = stratifications {
let var_coor = problem.var_coordinates();
let first: i32 = var_coor.start().try_into()
.with_context(|| format!("Failed to convert coordinates: {var_coor:?}"))?;
let last: i32 = var_coor.end().try_into()
.with_context(|| format!("Failed to convert coordinates: {var_coor:?}"))?;
assert!(first < last);
let containment_regions = strat.containments(
var_coor.chrom(), first, last - 1
);
Some(containment_regions)
} else {
None
};
let mut best_results = vec![];
for optimized_haplotypes in all_optimized_haplotypes.into_iter() {
if compare_config.enable_exact_shortcut && optimized_haplotypes.is_exact_match() {
debug!("B#{problem_id} exact match identified:");
let mut exact_result = generate_exact_match(
problem,
truth_variants, raw_truth_zygosity,
query_variants, raw_query_zygosity,
&reference[ref_start..ref_end], &optimized_haplotypes
)?;
if compare_config.enable_sequences {
let bundle = SequenceBundle::new(
String::from_utf8(ref_seq.to_vec())?,
String::from_utf8(optimized_haplotypes.truth_seq1().to_vec())?,
String::from_utf8(optimized_haplotypes.truth_seq2().to_vec())?,
String::from_utf8(optimized_haplotypes.query_seq1().to_vec())?,
String::from_utf8(optimized_haplotypes.query_seq2().to_vec())?,
);
exact_result.add_sequence_bundle(bundle);
}
if let Some(cr) = containment_regions {
exact_result.add_containment_regions(cr);
}
return Ok(exact_result);
}
let truth_zyg = optimized_haplotypes.truth_zygosity();
let query_zyg = optimized_haplotypes.query_zygosity();
let (truth_hap1, truth_hap2): (Vec<Allele>, Vec<Allele>) = truth_zyg.iter()
.map(|z| z.decompose_alleles())
.unzip();
let (query_hap1, query_hap2): (Vec<Allele>, Vec<Allele>) = query_zyg.iter()
.map(|z| z.decompose_alleles())
.unzip();
let hap1_compare = optimize_gt_alleles(
reference, coordinates,
truth_variants, &truth_hap1,
query_variants, &query_hap1
)?;
let hap2_compare = optimize_gt_alleles(
reference, coordinates,
truth_variants, &truth_hap2,
query_variants, &query_hap2
)?;
let mut truth_stats = compare_expected_observed(
problem_id,
optimized_haplotypes.ed1(),
optimized_haplotypes.ed2(),
truth_variants,
&truth_hap1,
hap1_compare.truth_alleles(),
&truth_hap2,
hap2_compare.truth_alleles()
)?;
if compare_config.enable_sequences {
let bundle = SequenceBundle::new(
String::from_utf8(ref_seq.to_vec())?,
String::from_utf8(optimized_haplotypes.truth_seq1().to_vec())?,
String::from_utf8(optimized_haplotypes.truth_seq2().to_vec())?,
String::from_utf8(optimized_haplotypes.query_seq1().to_vec())?,
String::from_utf8(optimized_haplotypes.query_seq2().to_vec())?,
);
truth_stats.add_sequence_bundle(bundle);
}
let query_stats = compare_expected_observed(
problem_id,
optimized_haplotypes.ed1(),
optimized_haplotypes.ed2(),
query_variants,
&query_hap1,
hap1_compare.query_alleles(),
&query_hap2,
hap2_compare.query_alleles()
)?;
best_results.push((optimized_haplotypes, hap1_compare, hap2_compare, truth_stats, query_stats));
}
let (optimized_haplotypes, h1c, h2c, mut truth_stats, query_stats) = best_results.into_iter()
.min_by_key(|(_oh, h1c, h2c, _ts, _qs)| h1c.num_errors() + h2c.num_errors()).unwrap();
debug!("Lowest cost by ALT flips: {}", h1c.num_errors()+h2c.num_errors());
truth_stats.add_swap_benchmark(&query_stats)?;
add_basepair_stats(problem, reference_genome, &mut truth_stats, &optimized_haplotypes)?;
add_record_basepair_stats(problem, &mut truth_stats)?;
if let Some(cr) = containment_regions {
truth_stats.add_containment_regions(cr);
}
Ok(truth_stats)
}
#[allow(clippy::too_many_arguments)]
fn compare_expected_observed(
problem_id: u64, ed1: usize, ed2: usize,
variants: &[Variant],
exp_hap1: &[Allele], obs_hap1: &[Allele],
exp_hap2: &[Allele], obs_hap2: &[Allele],
) -> anyhow::Result<CompareBenchmark> {
ensure!(variants.len() == exp_hap1.len(), "all variants and haplotype alleles must be equal length");
ensure!(variants.len() == obs_hap1.len(), "all variants and haplotype alleles must be equal length");
ensure!(variants.len() == exp_hap2.len(), "all variants and haplotype alleles must be equal length");
ensure!(variants.len() == obs_hap2.len(), "all variants and haplotype alleles must be equal length");
let mut benchmark = CompareBenchmark::new(
problem_id, ed1, ed2
);
let zip1 = exp_hap1.iter().cloned().zip(obs_hap1.iter().cloned());
let zip2 = exp_hap2.iter().cloned().zip(obs_hap2.iter().cloned());
for (v, ((exp_h1, obs_h1), (exp_h2, obs_h2))) in variants.iter().zip(zip1.zip(zip2)) {
let exp_count = exp_h1.to_allele_count() + exp_h2.to_allele_count();
let obs_count = obs_h1.to_allele_count() + obs_h2.to_allele_count();
assert!(exp_count >= obs_count);
benchmark.add_truth_zygosity(v, exp_count, obs_count)?;
}
Ok(benchmark)
}
fn add_basepair_stats(
problem: &CompareRegion, reference_genome: &ReferenceGenome,
bench_result: &mut CompareBenchmark, optimized_haps: &OptimizedHaplotypes
) -> anyhow::Result<()> {
let problem_id = problem.region_id();
let coordinates = problem.coordinates();
let reference = reference_genome.get_full_chromosome(coordinates.chrom()); let ref_start = coordinates.start() as usize;
let ref_end = coordinates.end() as usize;
let truth_variants = problem.truth_variants();
let truth_zygosity = optimized_haps.truth_zygosity();
let query_variants = problem.query_variants();
let query_zygosity = optimized_haps.query_zygosity();
for haplotype in [Haplotype::Hap1, Haplotype::Hap2] {
let (truth_seq, truth_ed) = generate_haplotype_sequence(
reference_genome, coordinates,
truth_variants, truth_zygosity, haplotype
)?;
let (query_seq, query_ed) = generate_haplotype_sequence(
reference_genome, coordinates,
query_variants, query_zygosity, haplotype
)?;
let p_truth_seq = if haplotype == Haplotype::Hap1 { optimized_haps.truth_seq1() } else { optimized_haps.truth_seq2() };
assert_eq!(&truth_seq, p_truth_seq);
let p_query_seq = if haplotype == Haplotype::Hap1 { optimized_haps.query_seq1() } else { optimized_haps.query_seq2() };
assert_eq!(&query_seq, p_query_seq);
let align_metrics = perform_basepair_compare(
&reference[ref_start..ref_end],
&truth_seq, &query_seq
)?;
debug!("B#{problem_id} Align metrics: {align_metrics:?}");
bench_result.add_basepair_metrics(align_metrics, None);
let truth_vs = 2 * truth_ed as u64;
let query_vs = 2 * query_ed as u64;
let skip_metrics = SummaryMetrics::new(0, truth_vs, 0, query_vs);
bench_result.add_basepair_metrics(skip_metrics, None);
for &filter_type in SUPPORTED_VARIANT_TYPES.iter() {
let (filt_qv, filt_qz): (Vec<Variant>, Vec<PhasedZygosity>) = query_variants.iter().zip(query_zygosity.iter())
.filter_map(|(qv, &qz)| if qv.variant_type() == filter_type {
Some((qv.clone(), qz))
} else {
None
})
.unzip();
let (query_tp, query_fp) = if !filt_qv.is_empty() {
let (filtered_query, failed_ed) = generate_haplotype_sequence(
reference_genome, coordinates, &filt_qv, &filt_qz, haplotype
).with_context(|| format!("Error while generating sequence for filtered query haplotype 1: {filter_type:?}"))?;
let failed_vs = 2 * failed_ed as u64;
let filt_align_metrics = perform_basepair_compare(
&reference[ref_start..ref_end],
&truth_seq, &filtered_query
)?;
(filt_align_metrics.query_tp, filt_align_metrics.query_fp + failed_vs) } else {
(0, 0)
};
let (filt_tv, filt_tz): (Vec<Variant>, Vec<PhasedZygosity>) = truth_variants.iter().zip(truth_zygosity.iter())
.filter_map(|(tv, &tz)| if tv.variant_type() == filter_type {
Some((tv.clone(), tz))
} else {
None
})
.unzip();
let (truth_tp, truth_fn) = if !filt_tv.is_empty() {
let (filtered_truth, failed_ed) = generate_haplotype_sequence(
reference_genome, coordinates, &filt_tv, &filt_tz, haplotype
).with_context(|| format!("Error while generating sequence for filtered truth haplotype 1: {filter_type:?}"))?;
let failed_vs = 2 * failed_ed as u64;
let filt_align_metrics = perform_basepair_compare(
&reference[ref_start..ref_end],
&filtered_truth, &query_seq
)?;
(filt_align_metrics.truth_tp, filt_align_metrics.truth_fn + failed_vs)
} else {
(0, 0)
};
let vt_align_metrics = SummaryMetrics::new(
truth_tp, truth_fn, query_tp, query_fp
);
debug!("\t{filter_type:?} => {vt_align_metrics:?}");
bench_result.add_basepair_metrics(vt_align_metrics, Some(filter_type));
}
}
Ok(())
}
fn add_record_basepair_stats(
problem: &CompareRegion, bench_result: &mut CompareBenchmark
) -> anyhow::Result<()> {
let mut truth_total = 0;
let mut truth_total_by_vt: BTreeMap<VariantType, u64> = Default::default();
for (variant, zygosity) in problem.truth_variants().iter().zip(problem.truth_zygosity().iter()) {
let zyg_count = zygosity.to_allele_count() as u64;
let count_value = zyg_count * variant.raw_allele_space() as u64;
let vt = variant.variant_type();
*truth_total_by_vt.entry(vt).or_insert(0) += count_value;
truth_total += count_value;
}
let mut query_total = 0;
let mut query_total_by_vt: BTreeMap<VariantType, u64> = Default::default();
for (variant, zygosity) in problem.query_variants().iter().zip(problem.query_zygosity().iter()) {
let zyg_count = zygosity.to_allele_count() as u64;
let count_value = zyg_count * variant.raw_allele_space() as u64;
let vt = variant.variant_type();
*query_total_by_vt.entry(vt).or_insert(0) += count_value;
query_total += count_value;
}
let basepair_metrics = bench_result.group_metrics().joint_metrics().basepair();
let truth_fn = basepair_metrics.truth_fn;
let query_fp = basepair_metrics.query_fp;
let truth_tp = 2*truth_total - truth_fn;
let query_tp = 2*query_total - query_fp;
ensure!(truth_tp >= basepair_metrics.truth_tp, "Truth TP is less than basepair TP");
ensure!(query_tp >= basepair_metrics.query_tp, "Query TP is less than basepair TP");
let alt_basepair_metrics = SummaryMetrics::new(truth_tp, truth_fn, query_tp, query_fp);
bench_result.add_record_bp_metrics(alt_basepair_metrics, None);
let variant_metrics = bench_result.group_metrics().variant_metrics().clone();
for (vt, vt_metrics) in variant_metrics.iter() {
let truth_total = truth_total_by_vt.get(vt).copied().unwrap_or_default();
let query_total = query_total_by_vt.get(vt).copied().unwrap_or_default();
let vt_basepair_metrics = vt_metrics.basepair();
let truth_fn = vt_basepair_metrics.truth_fn;
let query_fp = vt_basepair_metrics.query_fp;
let truth_tp = 2*truth_total - truth_fn;
let query_tp = 2*query_total - query_fp;
let alt_basepair_metrics = SummaryMetrics::new(truth_tp, truth_fn, query_tp, query_fp);
bench_result.add_record_bp_metrics(alt_basepair_metrics, Some(*vt));
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn generate_exact_match(
problem: &CompareRegion,
truth_variants: &[Variant], truth_zygosity: &[PhasedZygosity],
query_variants: &[Variant], query_zygosity: &[PhasedZygosity],
reference_sequence: &[u8], optimized_haps: &OptimizedHaplotypes
) -> anyhow::Result<CompareBenchmark> {
let mut bench_result = CompareBenchmark::new(problem.region_id(), 0, 0);
let expected_zyg_counts = generate_expected_zyg_counts(truth_zygosity);
for (&ev, variant) in expected_zyg_counts.iter().zip(truth_variants.iter()) {
debug!("\t{ev}, ={ev}, {variant:?}");
bench_result.add_truth_zygosity(variant, ev, ev)
.with_context(|| format!("Failed to add summary stats for variant {variant:?}"))?;
}
let expected_query_counts = generate_expected_zyg_counts(query_zygosity);
for (&ev, variant) in expected_query_counts.iter().zip(query_variants.iter()) {
debug!("\t{ev}, ={ev}, {variant:?}");
bench_result.add_query_zygosity(variant, ev, ev)
.with_context(|| format!("Failed to add summary stats for variant {variant:?}"))?;
}
assert_eq!(optimized_haps.truth_seq1(), optimized_haps.query_seq1());
assert_eq!(optimized_haps.truth_seq2(), optimized_haps.query_seq2());
let ed1 = wfa_ed(reference_sequence, optimized_haps.truth_seq1())?;
let ed2 = wfa_ed(reference_sequence, optimized_haps.truth_seq2())?;
let joint_ed_score = 2 * (ed1 + ed2) as u64;
let shared_metrics = SummaryMetrics::new(joint_ed_score, 0, joint_ed_score, 0);
bench_result.add_basepair_metrics(shared_metrics, None);
for (&ev, variant) in expected_zyg_counts.iter().zip(truth_variants.iter()) {
let ref_alt_dist = 2 * variant.alt_ed()? as u64;
let var_metrics = SummaryMetrics::new((ev as u64)*ref_alt_dist, 0, 0, 0);
let variant_type = variant.variant_type();
bench_result.add_basepair_metrics(var_metrics, Some(variant_type));
}
let expected_query_counts = generate_expected_zyg_counts(query_zygosity);
for (&ev, variant) in expected_query_counts.iter().zip(query_variants.iter()) {
let ref_alt_dist = 2 * variant.alt_ed()? as u64;
let var_metrics = SummaryMetrics::new(0, 0, (ev as u64)*ref_alt_dist, 0);
let variant_type = variant.variant_type();
bench_result.add_basepair_metrics(var_metrics, Some(variant_type));
}
Ok(bench_result)
}
fn perform_basepair_compare(
reference_sequence: &[u8],
truth_sequence: &[u8], query_sequence: &[u8]
) -> anyhow::Result<SummaryMetrics> {
let ed_ref_truth = 2 * wfa_ed(reference_sequence, truth_sequence)? as u64;
let ed_ref_query = 2 * wfa_ed(reference_sequence, query_sequence)? as u64;
let ed_truth_query = 2 * wfa_ed(truth_sequence, query_sequence)? as u64;
let tp_shared = (ed_ref_truth + ed_ref_query - ed_truth_query) / 2;
let truth_fn = ed_ref_truth - tp_shared;
let query_fp = ed_ref_query - tp_shared;
let summary_metrics = SummaryMetrics {
truth_tp: tp_shared,
truth_fn,
query_tp: tp_shared,
query_fp
};
Ok(summary_metrics)
}
fn generate_haplotype_sequence(
reference_genome: &ReferenceGenome, region: &Coordinates, variants: &[Variant], zygosities: &[PhasedZygosity], haplotype: Haplotype
) -> anyhow::Result<(Vec<u8>, usize)> {
let mut alleles = Vec::<Allele>::with_capacity(zygosities.len());
for zyg in zygosities.iter() {
let (a1, a2) = match *zyg {
PhasedZygosity::Unknown => bail!("Unknown zygosity encountered in variant"),
PhasedZygosity::HomozygousReference => (Allele::Reference, Allele::Reference),
PhasedZygosity::UnphasedHeterozygous => (Allele::Reference, Allele::Alternate),
PhasedZygosity::PhasedHet01 => (Allele::Reference, Allele::Alternate),
PhasedZygosity::PhasedHet10 => (Allele::Alternate, Allele::Reference),
PhasedZygosity::HomozygousAlternate => (Allele::Alternate, Allele::Alternate),
};
let allele = match haplotype {
Haplotype::Hap1 => a1,
Haplotype::Hap2 => a2,
};
alleles.push(allele);
}
generate_allele_sequence(reference_genome, region, variants, &alleles)
}
fn generate_allele_sequence(
reference_genome: &ReferenceGenome, region: &Coordinates, variants: &[Variant], alleles: &[Allele]
) -> anyhow::Result<(Vec<u8>, usize)> {
if variants.len() != alleles.len() {
bail!("variants and alleles must be the same length");
}
let chrom = region.chrom();
let mut current_ref_position = region.start() as usize;
let mut current_sequence = vec![];
let mut failed_ed = 0;
for (variant, &allele) in variants.iter().zip(alleles.iter()) {
if allele == Allele::Reference && variant.convert_index(0) == 0 {
continue;
}
let vpos = variant.position() as usize;
if vpos < current_ref_position {
assert_eq!(variant.convert_index(0), 0); failed_ed += variant.alt_ed()?;
continue;
}
current_sequence.extend_from_slice(
reference_genome.get_slice(chrom, current_ref_position, vpos)
);
current_ref_position = vpos;
let variant_sequence = match allele {
Allele::Unknown => bail!("Allele::Unknown encountered"),
Allele::Reference => variant.allele0(),
Allele::Alternate => variant.allele1(),
};
current_sequence.extend_from_slice(variant_sequence);
current_ref_position += variant.ref_len();
}
let end_pos = region.end() as usize;
current_sequence.extend_from_slice(
reference_genome.get_slice(chrom, current_ref_position, end_pos)
);
Ok((current_sequence, failed_ed))
}
fn generate_expected_zyg_counts(zygosities: &[PhasedZygosity]) -> Vec<u8> {
zygosities.iter()
.map(|z| {
match *z {
PhasedZygosity::Unknown => todo!("handle Unknown zygosity"),
PhasedZygosity::HomozygousReference => 0,
PhasedZygosity::UnphasedHeterozygous |
PhasedZygosity::PhasedHet01 |
PhasedZygosity::PhasedHet10 => 1,
PhasedZygosity::HomozygousAlternate => 2
}
})
.collect()
}
#[cfg(test)]
mod tests {
use crate::data_types::summary_metrics::{SummaryGtMetrics, SummaryMetrics};
use crate::data_types::variant_metrics::{VariantMetrics, VariantSource};
use super::*;
fn generate_simple_reference() -> ReferenceGenome {
let mut ref_genome = ReferenceGenome::empty_reference();
ref_genome.add_contig(
"mock_chr1".to_string(), "ACCGTTACCAGGACTTGACAAACCG"
).unwrap();
ref_genome
}
#[test]
fn test_generate_haplotype_sequence_001() {
let reference_genome = generate_simple_reference();
let region = Coordinates::new("mock_chr1".to_string(), 5, 15); let variants = [
Variant::new_snv(0, 10, b"G".to_vec(), b"C".to_vec()).unwrap()
];
let zygosities = [
PhasedZygosity::PhasedHet10
];
let (hap1_test, ed1) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap1
).unwrap();
assert_eq!(&hap1_test, b"TACCACGACT"); assert_eq!(ed1, 0);
let (hap2_test, ed2) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap2
).unwrap();
assert_eq!(&hap2_test, b"TACCAGGACT"); assert_eq!(ed2, 0);
}
#[test]
fn test_generate_haplotype_sequence_002() {
let reference_genome = generate_simple_reference();
let region = Coordinates::new("mock_chr1".to_string(), 0, 10); let variants = [
Variant::new_deletion(0, 3, b"GT".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 4, b"T".to_vec(), b"G".to_vec()).unwrap()
];
let zygosities = [
PhasedZygosity::PhasedHet10,
PhasedZygosity::PhasedHet01
];
let (hap1_test, ed1) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap1
).unwrap();
assert_eq!(&hap1_test, b"ACCGTACCA"); assert_eq!(ed1, 0);
let (hap2_test, ed2) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap2
).unwrap();
assert_eq!(&hap2_test, b"ACCGGTACCA"); assert_eq!(ed2, 0);
}
#[test]
fn test_generate_haplotype_sequence_conflict() {
let reference_genome = generate_simple_reference();
let region = Coordinates::new("mock_chr1".to_string(), 0, 10); let variants = [
Variant::new_deletion(0, 3, b"GT".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 4, b"T".to_vec(), b"G".to_vec()).unwrap()
];
let zygosities = [
PhasedZygosity::PhasedHet01,
PhasedZygosity::HomozygousAlternate
];
let (hap1_test, ed1) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap1
).unwrap();
assert_eq!(&hap1_test, b"ACCGGTACCA"); assert_eq!(ed1, 0);
let (hap2_test, ed2) = generate_haplotype_sequence(
&reference_genome, ®ion, &variants, &zygosities, Haplotype::Hap2
).unwrap();
assert_eq!(&hap2_test, b"ACCGTACCA"); assert_eq!(ed2, 1);
}
#[test]
fn test_solve_compare_region_simple_snv() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::PhasedHet10
];
let query_variants = truth_variants.clone();
let query_zygosity = truth_zygosity.clone();
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 0); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(1, 0, 1, 0, 0, 0));
assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(1, 0, 1, 0));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(2*1, 0, 2*1, 0));
assert_eq!(result.truth_variant_data(), &[VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap()]);
assert_eq!(result.query_variant_data(), &[VariantMetrics::new(VariantSource::Query, 1, 1).unwrap()]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACGGTTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACGGTTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCGTTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCGTTACCA");
}
#[test]
fn test_solve_compare_region_double_single() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 3, b"G".to_vec(), b"T".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::PhasedHet10,
PhasedZygosity::PhasedHet10
];
let query_variants = vec![
Variant::new_indel(0, 2, b"CG".to_vec(), b"GT".to_vec()).unwrap()
];
let query_zygosity = vec![
PhasedZygosity::PhasedHet01
];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 0); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(2, 0, 1, 0, 0, 0)); assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(2, 0, 1, 0));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(2*2, 0, 2*2, 0)); assert_eq!(result.truth_variant_data(), &[
VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap(),
VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap()
]);
assert_eq!(result.query_variant_data(), &[VariantMetrics::new(VariantSource::Query, 1, 1).unwrap()]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACGTTTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACGTTTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCGTTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCGTTACCA");
}
#[test]
fn test_solve_compare_region_single_double() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_indel(0, 2, b"CG".to_vec(), b"GT".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::PhasedHet01
];
let query_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 3, b"G".to_vec(), b"T".to_vec()).unwrap()
];
let query_zygosity = vec![
PhasedZygosity::PhasedHet10,
PhasedZygosity::PhasedHet10
];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 0); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(1, 0, 2, 0, 0, 0)); assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(1, 0, 2, 0));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(2*2, 0, 2*2, 0)); assert_eq!(result.truth_variant_data(), &[VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap()]);
assert_eq!(result.query_variant_data(), &[
VariantMetrics::new(VariantSource::Query, 1, 1).unwrap(),
VariantMetrics::new(VariantSource::Query, 1, 1).unwrap()
]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACCGTTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACGTTTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACGTTTACCA");
}
#[test]
fn test_solve_compare_region_insertion_shift() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_insertion(0, 0, b"A".to_vec(), b"AC".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::HomozygousAlternate
];
let query_variants = vec![
Variant::new_insertion(0, 2, b"C".to_vec(), b"CC".to_vec()).unwrap()
];
let query_zygosity = vec![
PhasedZygosity::HomozygousAlternate,
];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 0); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(1, 0, 1, 0, 0, 0)); assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(2, 0, 2, 0)); assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(2*2, 0, 2*2, 0)); assert_eq!(result.truth_variant_data(), &[VariantMetrics::new(VariantSource::Truth, 2, 2).unwrap()]);
assert_eq!(result.query_variant_data(), &[VariantMetrics::new(VariantSource::Query, 2, 2).unwrap()]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACCCGTTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACCCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCCGTTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCCGTTACCA");
}
#[test]
fn test_solve_compare_region_simple_fn() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::PhasedHet10
];
let query_variants = vec![];
let query_zygosity = vec![];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 1); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(0, 1, 0, 0, 0, 0));
assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(0, 1, 0, 0));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(0, 2*1, 0, 0)); assert_eq!(result.truth_variant_data(), &[VariantMetrics::new(VariantSource::Truth, 1, 0).unwrap()]);
assert_eq!(result.query_variant_data(), &[]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACGGTTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCGTTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCGTTACCA");
}
#[test]
fn test_solve_compare_region_complex_001() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 4, b"T".to_vec(), b"C".to_vec()).unwrap(),
Variant::new_snv(0, 6, b"A".to_vec(), b"C".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::PhasedHet10,
PhasedZygosity::PhasedHet01,
PhasedZygosity::PhasedHet10
];
let query_variants = vec![
Variant::new_snv(0, 2, b"C".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 6, b"A".to_vec(), b"C".to_vec()).unwrap()
];
let query_zygosity = vec![
PhasedZygosity::HomozygousAlternate,
PhasedZygosity::PhasedHet01
];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 2); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(2, 1, 1, 1, 0, 1));
assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(2, 1, 2, 1));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(2*2, 2*1, 2*2, 2*1)); assert_eq!(result.truth_variant_data(), &[
VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap(), VariantMetrics::new(VariantSource::Truth, 1, 0).unwrap(), VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap()
]);
assert_eq!(result.query_variant_data(), &[
VariantMetrics::new(VariantSource::Query, 1, 2).unwrap(), VariantMetrics::new(VariantSource::Query, 1, 1).unwrap()
]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACGGTTCCCA");
assert_eq!(&sequence_bundle.query_seq1, "ACGGTTCCCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCGCTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACGGTTACCA");
}
#[test]
fn test_solve_compare_region_ambiguous() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 10); let truth_variants = vec![
Variant::new_snv(0, 4, b"T".to_vec(), b"C".to_vec()).unwrap()
];
let truth_zygosity = vec![
PhasedZygosity::HomozygousAlternate
];
let query_variants = vec![
Variant::new_snv(0, 4, b"T".to_vec(), b"C".to_vec()).unwrap(),
Variant::new_snv(0, 4, b"T".to_vec(), b"A".to_vec()).unwrap()
];
let query_zygosity = vec![
PhasedZygosity::PhasedHet01,
PhasedZygosity::PhasedHet10
];
let problem = CompareRegion::new(
0, coordinates, truth_variants, truth_zygosity, query_variants, query_zygosity
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 1); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(0, 1, 1, 1, 1, 0));
assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(1, 1, 1, 1));
assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(3, 1, 3, 1)); assert_eq!(result.truth_variant_data(), &[
VariantMetrics::new(VariantSource::Truth, 2, 1).unwrap() ]);
assert_eq!(result.query_variant_data(), &[
VariantMetrics::new(VariantSource::Query, 1, 1).unwrap(), VariantMetrics::new(VariantSource::Query, 0, 1).unwrap() ]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACCGCTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACCGATACCA"); assert_eq!(&sequence_bundle.truth_seq2, "ACCGCTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCGCTACCA"); }
#[test]
fn test_solve_compare_region_skip_variants() {
let reference_genome = generate_simple_reference();
let region = Coordinates::new("mock_chr1".to_string(), 0, 10); let variants = vec![
Variant::new_deletion(0, 3, b"GT".to_vec(), b"G".to_vec()).unwrap(),
Variant::new_snv(0, 4, b"T".to_vec(), b"G".to_vec()).unwrap()
];
let zygosities = vec![
PhasedZygosity::PhasedHet01,
PhasedZygosity::HomozygousAlternate
];
let problem = CompareRegion::new(
0, region, variants.clone(), zygosities.clone(), variants, zygosities
).unwrap();
let result = solve_compare_region(&problem, &reference_genome, Default::default(), None).unwrap();
assert_eq!(result.total_ed(), 0); let group_metrics = result.group_metrics();
assert_eq!(*group_metrics.joint_metrics().gt(), SummaryGtMetrics::new(1, 1, 1, 1, 1, 1)); assert_eq!(*group_metrics.joint_metrics().hap(), SummaryMetrics::new(2, 1, 2, 1)); assert_eq!(*group_metrics.joint_metrics().basepair(), SummaryMetrics::new(4, 2, 4, 2)); assert_eq!(group_metrics.variant_metrics().get(&VariantType::Snv).unwrap().basepair(), &SummaryMetrics::new(3, 1, 3, 1)); assert_eq!(group_metrics.variant_metrics().get(&VariantType::Deletion).unwrap().basepair(), &SummaryMetrics::new(2, 0, 2, 0)); assert_eq!(result.truth_variant_data(), &[
VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap(),
VariantMetrics::new(VariantSource::Truth, 2, 1).unwrap()
]);
assert_eq!(result.query_variant_data(), &[
VariantMetrics::toggle_source(&VariantMetrics::new(VariantSource::Truth, 1, 1).unwrap()),
VariantMetrics::toggle_source(&VariantMetrics::new(VariantSource::Truth, 2, 1).unwrap())
]);
let sequence_bundle = result.sequence_bundle().unwrap();
assert_eq!(&sequence_bundle.ref_seq, "ACCGTTACCA");
assert_eq!(&sequence_bundle.truth_seq1, "ACCGGTACCA");
assert_eq!(&sequence_bundle.query_seq1, "ACCGGTACCA");
assert_eq!(&sequence_bundle.truth_seq2, "ACCGTACCA");
assert_eq!(&sequence_bundle.query_seq2, "ACCGTACCA");
}
#[test]
fn test_perform_basepair_compare_simple() {
let result = perform_basepair_compare(
b"ACGTACGT",
b"ACGTACGT", b"ACCTAGGT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(0, 0, 0, 2 * 2));
let result = perform_basepair_compare(
b"ACGTACGT",
b"ACCTAGGT", b"ACGTACGT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(0, 2 * 2, 0, 0));
let result = perform_basepair_compare(
b"ACGTACGT",
b"ACCTAGGT", b"ACCTAGGT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(2 * 2, 0, 2 * 2, 0));
}
#[test]
fn test_perform_basepair_compare_mixed() {
let result = perform_basepair_compare(
b"TAT",
b"TCT", b"TACT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(1, 1, 1, 1)); }
#[test]
fn test_perform_basepair_compare_indels() {
let result = perform_basepair_compare(
b"TAT",
b"TAAAAT", b"TAAAT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(2*2, 2*1, 2*2, 0));
let result = perform_basepair_compare(
b"TAT",
b"TAAAAT", b"TAAAAAT" ).unwrap();
assert_eq!(result, SummaryMetrics::new(2*3, 0, 2*3, 2*1));
}
}