use std::collections::HashMap;
use std::path::Path;
use anyhow::Result;
use crate::db::Database;
use crate::io::vcf::VcfReader;
use crate::output::Format;
use crate::variant::AnnotatedVariant;
pub struct VariantComparison {
pub shared_count: u64,
pub unique_a_count: u64,
pub unique_b_count: u64,
pub total_a: u64,
pub total_b: u64,
pub clinical_differences: Vec<ClinicalDifference>,
pub kinship_estimate: Option<f64>,
}
pub struct ClinicalDifference {
pub variant: AnnotatedVariant,
pub genotype_a: String,
pub genotype_b: String,
}
#[derive(Hash, Eq, PartialEq)]
struct VariantKey {
chrom: String,
pos: u64,
reference: String,
alt: String,
}
impl VariantComparison {
pub async fn run(
file_a: &Path,
file_b: &Path,
db: &Database,
kinship: bool,
clinical_only: bool,
) -> Result<Self> {
let variants_a = load_variants(file_a).await?;
let total_a = variants_a.len() as u64;
let mut reader_b = VcfReader::open(file_b).await?;
let mut shared_count = 0u64;
let mut unique_b_count = 0u64;
let mut total_b = 0u64;
let mut seen_in_a = HashMap::new();
let mut clinical_differences = Vec::new();
while let Some(record) = reader_b.next_record().await? {
total_b += 1;
let key = VariantKey {
chrom: record.chrom.clone(),
pos: record.pos,
reference: record.reference.clone(),
alt: record.alt.clone(),
};
if variants_a.contains_key(&key) {
shared_count += 1;
seen_in_a.insert(key, true);
} else {
unique_b_count += 1;
if clinical_only || clinical_differences.len() < 100 {
if let Some(annotated) = db.annotate_vcf_record(&record)? {
if is_clinically_significant(&annotated) {
clinical_differences.push(ClinicalDifference {
variant: annotated,
genotype_a: "0/0".to_string(),
genotype_b: "0/1".to_string(),
});
}
}
}
}
}
let unique_a_count = total_a - shared_count;
let kinship_estimate = if kinship {
let total = total_a.max(total_b) as f64;
if total > 0.0 {
Some(shared_count as f64 / total)
} else {
None
}
} else {
None
};
Ok(Self {
shared_count,
unique_a_count,
unique_b_count,
total_a,
total_b,
clinical_differences,
kinship_estimate,
})
}
pub fn print(&self, format: Format) -> Result<()> {
match format {
Format::Json => {
println!(
"{}",
serde_json::json!({
"shared": self.shared_count,
"unique_a": self.unique_a_count,
"unique_b": self.unique_b_count,
"total_a": self.total_a,
"total_b": self.total_b,
"kinship": self.kinship_estimate,
"clinical_differences": self.clinical_differences.len(),
})
);
}
_ => {
println!("Shared variants: {}", self.shared_count);
println!("Unique to file A: {}", self.unique_a_count);
println!("Unique to file B: {}", self.unique_b_count);
println!();
if let Some(kinship) = self.kinship_estimate {
println!("Kinship estimate: {kinship:.3}");
println!(
"Likely relationship: {}",
estimate_relationship(kinship)
);
println!();
}
if !self.clinical_differences.is_empty() {
println!(
"Clinically significant differences: {}",
self.clinical_differences.len()
);
for diff in &self.clinical_differences {
let sig = diff
.variant
.clinvar
.as_ref()
.map(|c| c.significance.as_str())
.unwrap_or("Unknown");
let gene = diff.variant.gene.as_deref().unwrap_or("?");
let rsid = diff
.variant
.rsid
.as_deref()
.unwrap_or(".");
println!(
" {rsid:<14} {gene:<10} {sig:<20} A={} B={}",
diff.genotype_a, diff.genotype_b
);
}
}
}
}
Ok(())
}
}
async fn load_variants(path: &Path) -> Result<HashMap<VariantKey, bool>> {
let mut reader = VcfReader::open(path).await?;
let mut variants = HashMap::new();
while let Some(record) = reader.next_record().await? {
let key = VariantKey {
chrom: record.chrom,
pos: record.pos,
reference: record.reference,
alt: record.alt,
};
variants.insert(key, true);
}
Ok(variants)
}
fn is_clinically_significant(variant: &AnnotatedVariant) -> bool {
variant
.clinvar
.as_ref()
.map(|c| {
c.significance.contains("Pathogenic")
|| c.significance.contains("Likely pathogenic")
})
.unwrap_or(false)
}
fn estimate_relationship(kinship: f64) -> &'static str {
if kinship > 0.95 {
"Identical / Monozygotic twins"
} else if kinship > 0.45 {
"Parent-Child / Full siblings"
} else if kinship > 0.20 {
"Half siblings / Grandparent-Grandchild"
} else if kinship > 0.08 {
"First cousins"
} else if kinship > 0.03 {
"Distant relatives"
} else {
"Unrelated"
}
}