genome-sh 0.1.0

The jq of genomics. Fast, local, human-readable variant analysis.
use std::collections::HashMap;
use std::path::Path;

use anyhow::Result;

use crate::db::Database;
use crate::io::vcf::VcfReader;
use crate::output::Format;
use crate::variant::AnnotatedVariant;

/// Result of comparing two VCF files.
pub struct VariantComparison {
    pub shared_count: u64,
    pub unique_a_count: u64,
    pub unique_b_count: u64,
    pub total_a: u64,
    pub total_b: u64,
    pub clinical_differences: Vec<ClinicalDifference>,
    pub kinship_estimate: Option<f64>,
}

pub struct ClinicalDifference {
    pub variant: AnnotatedVariant,
    pub genotype_a: String,
    pub genotype_b: String,
}

/// Key for variant deduplication.
#[derive(Hash, Eq, PartialEq)]
struct VariantKey {
    chrom: String,
    pos: u64,
    reference: String,
    alt: String,
}

impl VariantComparison {
    /// Run a comparison between two VCF files.
    pub async fn run(
        file_a: &Path,
        file_b: &Path,
        db: &Database,
        kinship: bool,
        clinical_only: bool,
    ) -> Result<Self> {
        // Load variants from file A into a hashmap.
        let variants_a = load_variants(file_a).await?;
        let total_a = variants_a.len() as u64;

        // Stream file B and compare.
        let mut reader_b = VcfReader::open(file_b).await?;
        let mut shared_count = 0u64;
        let mut unique_b_count = 0u64;
        let mut total_b = 0u64;
        let mut seen_in_a = HashMap::new();
        let mut clinical_differences = Vec::new();

        while let Some(record) = reader_b.next_record().await? {
            total_b += 1;
            let key = VariantKey {
                chrom: record.chrom.clone(),
                pos: record.pos,
                reference: record.reference.clone(),
                alt: record.alt.clone(),
            };

            if variants_a.contains_key(&key) {
                shared_count += 1;
                seen_in_a.insert(key, true);
            } else {
                unique_b_count += 1;

                // Check if this unique variant is clinically significant.
                if clinical_only || clinical_differences.len() < 100 {
                    if let Some(annotated) = db.annotate_vcf_record(&record)? {
                        if is_clinically_significant(&annotated) {
                            clinical_differences.push(ClinicalDifference {
                                variant: annotated,
                                genotype_a: "0/0".to_string(),
                                genotype_b: "0/1".to_string(),
                            });
                        }
                    }
                }
            }
        }

        let unique_a_count = total_a - shared_count;

        // Simple kinship estimate based on shared variant proportion.
        let kinship_estimate = if kinship {
            let total = total_a.max(total_b) as f64;
            if total > 0.0 {
                Some(shared_count as f64 / total)
            } else {
                None
            }
        } else {
            None
        };

        Ok(Self {
            shared_count,
            unique_a_count,
            unique_b_count,
            total_a,
            total_b,
            clinical_differences,
            kinship_estimate,
        })
    }

    pub fn print(&self, format: Format) -> Result<()> {
        match format {
            Format::Json => {
                println!(
                    "{}",
                    serde_json::json!({
                        "shared": self.shared_count,
                        "unique_a": self.unique_a_count,
                        "unique_b": self.unique_b_count,
                        "total_a": self.total_a,
                        "total_b": self.total_b,
                        "kinship": self.kinship_estimate,
                        "clinical_differences": self.clinical_differences.len(),
                    })
                );
            }
            _ => {
                println!("Shared variants:    {}", self.shared_count);
                println!("Unique to file A:   {}", self.unique_a_count);
                println!("Unique to file B:   {}", self.unique_b_count);
                println!();

                if let Some(kinship) = self.kinship_estimate {
                    println!("Kinship estimate:   {kinship:.3}");
                    println!(
                        "Likely relationship: {}",
                        estimate_relationship(kinship)
                    );
                    println!();
                }

                if !self.clinical_differences.is_empty() {
                    println!(
                        "Clinically significant differences: {}",
                        self.clinical_differences.len()
                    );
                    for diff in &self.clinical_differences {
                        let sig = diff
                            .variant
                            .clinvar
                            .as_ref()
                            .map(|c| c.significance.as_str())
                            .unwrap_or("Unknown");
                        let gene = diff.variant.gene.as_deref().unwrap_or("?");
                        let rsid = diff
                            .variant
                            .rsid
                            .as_deref()
                            .unwrap_or(".");
                        println!(
                            "  {rsid:<14} {gene:<10} {sig:<20} A={} B={}",
                            diff.genotype_a, diff.genotype_b
                        );
                    }
                }
            }
        }

        Ok(())
    }
}

async fn load_variants(path: &Path) -> Result<HashMap<VariantKey, bool>> {
    let mut reader = VcfReader::open(path).await?;
    let mut variants = HashMap::new();

    while let Some(record) = reader.next_record().await? {
        let key = VariantKey {
            chrom: record.chrom,
            pos: record.pos,
            reference: record.reference,
            alt: record.alt,
        };
        variants.insert(key, true);
    }

    Ok(variants)
}

fn is_clinically_significant(variant: &AnnotatedVariant) -> bool {
    variant
        .clinvar
        .as_ref()
        .map(|c| {
            c.significance.contains("Pathogenic")
                || c.significance.contains("Likely pathogenic")
        })
        .unwrap_or(false)
}

fn estimate_relationship(kinship: f64) -> &'static str {
    if kinship > 0.95 {
        "Identical / Monozygotic twins"
    } else if kinship > 0.45 {
        "Parent-Child / Full siblings"
    } else if kinship > 0.20 {
        "Half siblings / Grandparent-Grandchild"
    } else if kinship > 0.08 {
        "First cousins"
    } else if kinship > 0.03 {
        "Distant relatives"
    } else {
        "Unrelated"
    }
}