//! genome-sh 0.1.0
//!
//! The jq of genomics. Fast, local, human-readable variant analysis.
pub mod comparison;

use anyhow::Result;
use serde::{Deserialize, Serialize};

/// Parsed query input. Determines which lookup strategy to use.
///
/// Values are produced by [`VariantQuery::parse`]; each variant corresponds to
/// one of the query formats that function recognises.
#[derive(Debug)]
pub enum VariantQuery {
    /// An rsID, stored as the trimmed query text including its `rs` prefix
    /// (e.g. `rs12345`).
    Rsid(String),
    /// A genomic position, optionally with alleles
    /// (e.g. `chr17:43092919` or `chr17:43092919:G:A`).
    Coordinates {
        /// Chromosome name. `parse` passes it through `normalize_chrom`, so it
        /// carries a `chr` prefix.
        chrom: String,
        /// Position on the chromosome, parsed from the second `:`-separated field.
        // NOTE(review): whether this is 0- or 1-based is not visible here — confirm.
        pos: u64,
        /// Reference allele. `parse` stores an empty string when the query omits it.
        r#ref: String,
        /// Alternate allele. `parse` stores an empty string when the query omits it.
        alt: String,
    },
    /// A gene symbol, stored as the trimmed query text (e.g. `BRCA1`).
    Gene(String),
    /// An HGVS expression, stored as the trimmed query text
    /// (e.g. `NM_007294.4:c.5266dupC`).
    Hgvs(String),
}

impl VariantQuery {
    /// Parse a raw query string into a structured query.
    ///
    /// Leading and trailing whitespace is ignored. The formats are tried in
    /// this order, and the first match wins:
    ///
    /// 1. **rsID** – `rs` followed by one or more ASCII digits (`rs12345`).
    /// 2. **Coordinates** – `chrom:pos`, `chrom:pos:ref` or `chrom:pos:ref:alt`
    ///    with no `.` anywhere in the input (`chr17:43092919:G:A`). The
    ///    chromosome must be non-empty and `pos` must parse as a `u64`.
    ///    Omitted alleles are stored as empty strings.
    /// 3. **HGVS** – any input containing both `.` and `:`
    ///    (`NM_007294.4:c.5266dupC`).
    /// 4. **Gene** – 2 to 15 characters, each an ASCII uppercase letter or
    ///    digit (`BRCA1`).
    ///
    /// # Errors
    ///
    /// Returns an error listing the supported formats when the input matches
    /// none of the rules above.
    pub fn parse(raw: &str) -> Result<Self> {
        let trimmed = raw.trim();

        // rsID: rs12345. At least one digit is required; a bare "rs" must not
        // be accepted (an emptiness check is needed because `all` is vacuously
        // true on an empty iterator).
        // NOTE(review): a bare numeric ID such as "12345" is not treated as an
        // rsID here; it falls through to the gene rule below.
        if let Some(digits) = trimmed.strip_prefix("rs") {
            if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()) {
                return Ok(Self::Rsid(trimmed.to_string()));
            }
        }

        // Coordinates: chr17:43092919 or chr17:43092919:G:A
        if trimmed.contains(':') && !trimmed.contains('.') {
            let parts: Vec<&str> = trimmed.split(':').collect();
            if let [chrom, pos, alleles @ ..] = parts.as_slice() {
                // Reject an empty chromosome (":123") and any fields beyond
                // ref/alt rather than silently discarding them.
                if !chrom.is_empty() && alleles.len() <= 2 {
                    if let Ok(pos) = pos.parse::<u64>() {
                        // Missing alleles are represented as empty strings.
                        let allele =
                            |idx: usize| alleles.get(idx).copied().unwrap_or("").to_string();
                        return Ok(Self::Coordinates {
                            chrom: normalize_chrom(chrom),
                            pos,
                            r#ref: allele(0),
                            alt: allele(1),
                        });
                    }
                }
            }
        }

        // HGVS: contains both a dot and a colon (e.g., NM_007294.4:c.5266dupC)
        if trimmed.contains('.') && trimmed.contains(':') {
            return Ok(Self::Hgvs(trimmed.to_string()));
        }

        // Gene name: 2-15 characters, ASCII uppercase letters and digits only.
        // Byte length equals character count here because every accepted
        // character is ASCII.
        if (2..=15).contains(&trimmed.len())
            && trimmed
                .chars()
                .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
        {
            return Ok(Self::Gene(trimmed.to_string()));
        }

        anyhow::bail!(
            "Cannot parse query: {trimmed}\n\
             Supported formats:\n\
             - rsID: rs12345\n\
             - Coordinates: chr17:43092919:G:A\n\
             - Gene: BRCA1\n\
             - HGVS: NM_007294.4:c.5266dupC"
        )
    }
}

/// A variant with annotations from all available databases.
///
/// The three annotation fields are optional, so a variant can be represented
/// even when a given source has no record for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnnotatedVariant {
    /// Chromosome name.
    // NOTE(review): presumably `chr`-prefixed like the output of `normalize_chrom` — confirm.
    pub chrom: String,
    /// Position on the chromosome.
    pub pos: u64,
    /// Reference allele.
    pub reference: String,
    /// Alternate allele.
    pub alt: String,
    /// rsID for this variant, if one is known.
    pub rsid: Option<String>,
    /// Gene associated with this variant, if one is known.
    pub gene: Option<String>,
    /// Genome assembly the coordinates refer to.
    // NOTE(review): the expected values (e.g. a GRCh build name) are not visible here — confirm.
    pub assembly: String,
    /// ClinVar annotation, if available.
    pub clinvar: Option<ClinVarData>,
    /// gnomAD annotation, if available.
    pub gnomad: Option<GnomadData>,
    /// PharmGKB annotation, if available.
    pub pharmgkb: Option<PharmgkbData>,
}

/// ClinVar annotation attached to an [`AnnotatedVariant`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClinVarData {
    /// Clinical significance label, stored as free text.
    pub significance: String,
    /// Review-status star count.
    // NOTE(review): presumably ClinVar's 0-4 star scale — confirm against the loader.
    pub review_stars: i32,
    /// Associated conditions, stored as a single string.
    // NOTE(review): the delimiter between multiple conditions is not visible here — confirm.
    pub conditions: String,
    /// When the record was last reviewed, if known.
    // NOTE(review): the date format is not visible here — confirm.
    pub last_reviewed: Option<String>,
    /// Number of submissions behind this record.
    pub submission_count: i32,
}

/// gnomAD annotation attached to an [`AnnotatedVariant`]: one global value plus
/// one value per population group.
// NOTE(review): the `af_` prefix presumably means allele frequency, and the
// suffixes presumably follow gnomAD's population codes (afr, amr, asj, eas,
// fin, nfe, sas) — confirm against the data loader.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GnomadData {
    /// Value across all populations.
    pub af_global: f64,
    /// Value for the `afr` group.
    pub af_afr: f64,
    /// Value for the `amr` group.
    pub af_amr: f64,
    /// Value for the `asj` group.
    pub af_asj: f64,
    /// Value for the `eas` group.
    pub af_eas: f64,
    /// Value for the `fin` group.
    pub af_fin: f64,
    /// Value for the `nfe` group.
    pub af_nfe: f64,
    /// Value for the `sas` group.
    pub af_sas: f64,
}

/// PharmGKB annotation attached to an [`AnnotatedVariant`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PharmgkbData {
    /// Drug the annotation refers to.
    pub drug: String,
    /// Phenotype associated with the drug/variant pair, if recorded.
    pub phenotype: Option<String>,
    /// Evidence level, stored as text.
    // NOTE(review): presumably PharmGKB's level-of-evidence codes — confirm.
    pub evidence_level: String,
}

/// Return `chrom` with a canonical lower-case `chr` prefix.
///
/// An existing `chr` prefix is recognised case-insensitively and rewritten in
/// lower case, so `"CHR17"` and `"Chr17"` both become `"chr17"` instead of
/// gaining a second prefix. Input without the prefix has one prepended
/// (`"17"` becomes `"chr17"`). The rest of the name is left untouched.
fn normalize_chrom(chrom: &str) -> String {
    // `get(..3)` yields `None` for short input or when byte 3 is not a char
    // boundary, so the `[3..]` slice below can never panic.
    match chrom.get(..3) {
        Some(prefix) if prefix.eq_ignore_ascii_case("chr") => format!("chr{}", &chrom[3..]),
        _ => format!("chr{chrom}"),
    }
}