use std::collections::BTreeMap;
use super::{normalize_variant, BedIndex, NormalizedVariant, VcfVariant, Zygosity};
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum VariantType {
Snv,
Indel,
}
#[derive(Debug, Clone)]
pub struct ComparisonReport {
pub total_truth: usize,
pub total_calls: usize,
pub true_positive: usize,
pub false_positive: usize,
pub false_negative: usize,
pub by_type: BTreeMap<VariantType, (usize, usize, usize)>, pub genotype_concordant: usize,
pub genotype_discordant: usize,
pub genotype_unknown: usize,
}
impl ComparisonReport {
pub fn genotype_concordance(&self) -> f64 {
let scorable = self.genotype_concordant + self.genotype_discordant;
if scorable == 0 {
0.0
} else {
self.genotype_concordant as f64 / scorable as f64
}
}
pub fn precision(&self) -> f64 {
if self.true_positive + self.false_positive == 0 {
0.0
} else {
self.true_positive as f64 / (self.true_positive + self.false_positive) as f64
}
}
pub fn recall(&self) -> f64 {
if self.true_positive + self.false_negative == 0 {
0.0
} else {
self.true_positive as f64 / (self.true_positive + self.false_negative) as f64
}
}
}
pub fn compare_callsets(
references: &BTreeMap<String, Vec<u8>>,
calls: &[VcfVariant],
truth: &[VcfVariant],
bed: Option<&BedIndex>,
) -> Result<ComparisonReport, anyhow::Error> {
let mut calls_map: BTreeMap<NormalizedVariant, Option<Zygosity>> = BTreeMap::new();
let mut truth_map: BTreeMap<NormalizedVariant, Option<Zygosity>> = BTreeMap::new();
for v in calls {
if let Some(bed) = bed {
if !bed.contains(&v.chrom, v.pos0) {
continue;
}
}
calls_map.insert(
normalize_variant(reference_for(references, v)?, v)?,
v.genotype,
);
}
for v in truth {
if let Some(bed) = bed {
if !bed.contains(&v.chrom, v.pos0) {
continue;
}
}
truth_map.insert(
normalize_variant(reference_for(references, v)?, v)?,
v.genotype,
);
}
let mut tp = 0usize;
let mut fp = 0usize;
let mut fn_ = 0usize;
let mut gt_concordant = 0usize;
let mut gt_discordant = 0usize;
let mut gt_unknown = 0usize;
let mut by_type: BTreeMap<VariantType, (usize, usize, usize)> = BTreeMap::new();
for (v, call_gt) in calls_map.iter() {
let vt = variant_type(v);
if let Some(truth_gt) = truth_map.get(v) {
tp += 1;
by_type.entry(vt).or_insert((0, 0, 0)).0 += 1;
match (call_gt, truth_gt) {
(Some(c), Some(t)) if c == t => gt_concordant += 1,
(Some(_), Some(_)) => gt_discordant += 1,
_ => gt_unknown += 1,
}
} else {
fp += 1;
by_type.entry(vt).or_insert((0, 0, 0)).1 += 1;
}
}
for v in truth_map.keys() {
if !calls_map.contains_key(v) {
fn_ += 1;
by_type.entry(variant_type(v)).or_insert((0, 0, 0)).2 += 1;
}
}
Ok(ComparisonReport {
total_truth: truth_map.len(),
total_calls: calls_map.len(),
true_positive: tp,
false_positive: fp,
false_negative: fn_,
by_type,
genotype_concordant: gt_concordant,
genotype_discordant: gt_discordant,
genotype_unknown: gt_unknown,
})
}
fn reference_for<'a>(
references: &'a BTreeMap<String, Vec<u8>>,
v: &VcfVariant,
) -> Result<&'a [u8], anyhow::Error> {
references.get(&v.chrom).map(Vec::as_slice).ok_or_else(|| {
let available: Vec<&str> = references.keys().map(String::as_str).collect();
anyhow::anyhow!(
"variant on contig '{}' (pos {}) has no matching sequence in the reference FASTA — \
check the contig naming scheme (e.g. UCSC 'chr1' vs Ensembl '1'). \
Reference contigs: [{}]",
v.chrom,
v.pos0 + 1,
available.join(", ")
)
})
}
fn variant_type(v: &NormalizedVariant) -> VariantType {
if v.reference.len() == 1 && v.alternate.len() == 1 {
VariantType::Snv
} else {
VariantType::Indel
}
}
#[cfg(test)]
mod gt_concordance_tests {
use super::*;
use crate::genomics::eval::Zygosity;
fn vv(pos1: u32, refb: &str, alt: &str, gt: Option<Zygosity>) -> VcfVariant {
VcfVariant {
chrom: "chr1".to_string(),
pos0: pos1 - 1,
reference: refb.as_bytes().to_vec(),
alternate: alt.as_bytes().to_vec(),
qual: None,
filter: ".".to_string(),
info: BTreeMap::new(),
genotype: gt,
}
}
fn refs() -> BTreeMap<String, Vec<u8>> {
BTreeMap::from([("chr1".to_string(), b"ACGTACGTAC".to_vec())])
}
#[test]
fn het_called_as_hom_is_a_detection_tp_but_a_genotype_error() {
let truth = vec![vv(5, "A", "C", Some(Zygosity::Het))];
let calls = vec![vv(5, "A", "C", Some(Zygosity::Hom))]; let r = compare_callsets(&refs(), &calls, &truth, None).unwrap();
assert_eq!(r.true_positive, 1, "detection still matches");
assert_eq!(r.false_positive, 0);
assert_eq!(r.false_negative, 0);
assert_eq!(r.genotype_concordant, 0);
assert_eq!(r.genotype_discordant, 1);
assert_eq!(r.genotype_concordance(), 0.0);
}
#[test]
fn matching_genotype_is_concordant() {
let truth = vec![vv(5, "A", "C", Some(Zygosity::Het))];
let calls = vec![vv(5, "A", "C", Some(Zygosity::Het))];
let r = compare_callsets(&refs(), &calls, &truth, None).unwrap();
assert_eq!((r.genotype_concordant, r.genotype_discordant), (1, 0));
assert_eq!(r.genotype_concordance(), 1.0);
}
#[test]
fn unknown_genotype_is_counted_separately_not_as_error() {
let truth = vec![vv(5, "A", "C", None)]; let calls = vec![vv(5, "A", "C", Some(Zygosity::Het))];
let r = compare_callsets(&refs(), &calls, &truth, None).unwrap();
assert_eq!(r.true_positive, 1);
assert_eq!(
(
r.genotype_concordant,
r.genotype_discordant,
r.genotype_unknown
),
(0, 0, 1)
);
}
}