genome-sh 0.1.0

The jq of genomics. Fast, local, human-readable variant analysis.
pub mod config;
mod download;
mod schema;

use std::path::PathBuf;
use std::sync::Mutex;

use anyhow::{Context, Result};
use rusqlite::Connection;

use crate::io::vcf::VcfRecord;
use crate::variant::{AnnotatedVariant, ClinVarData, GnomadData, PharmgkbData, VariantQuery};

/// Database tiers control which variant sources are included.
#[derive(Clone, Debug, clap::ValueEnum)]
pub enum DbTier {
    /// ClinVar only (~170 MB download, ~1 GB on disk).
    Lite,
    /// ClinVar + gnomAD exomes (~5 GB download, ~10 GB on disk).
    Standard,
    /// ClinVar + gnomAD + dbSNP + PharmGKB + UniProt (~15 GB download, ~25 GB on disk).
    Full,
}

/// Handle to the local variant database.
/// Uses a Mutex so it can be shared across async tasks in the API server.
pub struct Database {
    conn: Mutex<Connection>,
}

impl Database {
    /// Open the database at the default location.
    pub fn open() -> Result<Self> {
        let path = db_path();
        if !path.exists() {
            anyhow::bail!(
                "No database found at {}. Run `genome db install` first.",
                path.display()
            );
        }
        Self::open_at(&path)
    }

    /// Open the database at a specific path.
    pub fn open_at(path: &std::path::Path) -> Result<Self> {
        if !path.exists() {
            anyhow::bail!("No database found at {}.", path.display());
        }

        let conn = Connection::open(path)
            .with_context(|| format!("Failed to open database at {}", path.display()))?;

        conn.execute_batch(
            "PRAGMA journal_mode = WAL;
             PRAGMA synchronous = NORMAL;
             PRAGMA cache_size = -64000;
             PRAGMA mmap_size = 268435456;",
        )?;

        Ok(Self {
            conn: Mutex::new(conn),
        })
    }

    /// Open or create the database for writing (used during `db install`).
    pub fn open_or_create() -> Result<Self> {
        let path = db_path();
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent)?;
        }

        let conn = Connection::open(&path)?;

        conn.execute_batch(
            "PRAGMA journal_mode = WAL;
             PRAGMA synchronous = NORMAL;
             PRAGMA cache_size = -128000;",
        )?;

        schema::create_tables(&conn)?;

        Ok(Self {
            conn: Mutex::new(conn),
        })
    }

    /// Query variants by rsID, coordinates, HGVS, or gene name.
    pub fn query(&self, query: &VariantQuery) -> Result<Vec<AnnotatedVariant>> {
        match query {
            VariantQuery::Rsid(rsid) => self.query_by_rsid(rsid),
            VariantQuery::Coordinates {
                chrom,
                pos,
                r#ref,
                alt,
            } => self.query_by_coordinates(chrom, *pos, r#ref, alt),
            VariantQuery::Gene(gene) => self.query_by_gene(gene),
            VariantQuery::Hgvs(hgvs) => self.query_by_hgvs(hgvs),
        }
    }

    /// Annotate a VCF record against the database.
    pub fn annotate_vcf_record(&self, record: &VcfRecord) -> Result<Option<AnnotatedVariant>> {
        let results =
            self.query_by_coordinates(&record.chrom, record.pos, &record.reference, &record.alt)?;
        Ok(results.into_iter().next())
    }

    /// Get access to the underlying connection (locks the mutex).
    /// Used by the API routes for raw queries.
    pub fn connection(&self) -> std::sync::MutexGuard<'_, Connection> {
        self.conn.lock().expect("database mutex poisoned")
    }

    fn query_by_rsid(&self, rsid: &str) -> Result<Vec<AnnotatedVariant>> {
        let normalized = if rsid.starts_with("rs") {
            rsid.to_string()
        } else {
            format!("rs{rsid}")
        };

        let conn = self.connection();
        let mut stmt = conn.prepare_cached(VARIANT_QUERY_SQL)?;

        let variants = stmt
            .query_map(rusqlite::params![normalized], |row| {
                Ok(row_to_variant(row))
            })?
            .collect::<Result<Vec<_>, _>>()?;

        Ok(variants)
    }

    fn query_by_coordinates(
        &self,
        chrom: &str,
        pos: u64,
        reference: &str,
        alt: &str,
    ) -> Result<Vec<AnnotatedVariant>> {
        let conn = self.connection();
        let mut stmt = conn.prepare_cached(
            "SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
                    c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
                    g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
                    p.drug, p.phenotype, p.evidence_level
             FROM variants v
             LEFT JOIN clinvar c ON c.variant_id = v.id
             LEFT JOIN gnomad_cache g ON g.variant_id = v.id
             LEFT JOIN pharmgkb p ON p.variant_id = v.id
             WHERE v.chrom = ?1 AND v.pos = ?2 AND v.ref = ?3 AND v.alt = ?4",
        )?;

        let variants = stmt
            .query_map(rusqlite::params![chrom, pos as i64, reference, alt], |row| {
                Ok(row_to_variant(row))
            })?
            .collect::<Result<Vec<_>, _>>()?;

        Ok(variants)
    }

    fn query_by_gene(&self, gene: &str) -> Result<Vec<AnnotatedVariant>> {
        let conn = self.connection();
        let mut stmt = conn.prepare_cached(
            "SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
                    c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
                    g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
                    p.drug, p.phenotype, p.evidence_level
             FROM variants v
             LEFT JOIN clinvar c ON c.variant_id = v.id
             LEFT JOIN gnomad_cache g ON g.variant_id = v.id
             LEFT JOIN pharmgkb p ON p.variant_id = v.id
             WHERE v.gene = ?1 AND c.significance IN ('Pathogenic', 'Likely pathogenic')
             ORDER BY c.review_stars DESC
             LIMIT 100",
        )?;

        let variants = stmt
            .query_map([&gene.to_uppercase()], |row| Ok(row_to_variant(row)))?
            .collect::<Result<Vec<_>, _>>()?;

        Ok(variants)
    }

    fn query_by_hgvs(&self, _hgvs: &str) -> Result<Vec<AnnotatedVariant>> {
        anyhow::bail!("HGVS query is not yet implemented. Use rsID or coordinates instead.")
    }
}

const VARIANT_QUERY_SQL: &str =
    "SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
            c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
            g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
            p.drug, p.phenotype, p.evidence_level
     FROM variants v
     LEFT JOIN clinvar c ON c.variant_id = v.id
     LEFT JOIN gnomad_cache g ON g.variant_id = v.id
     LEFT JOIN pharmgkb p ON p.variant_id = v.id
     WHERE v.rsid = ?1";

fn row_to_variant(row: &rusqlite::Row) -> AnnotatedVariant {
    let clinvar = row
        .get::<_, Option<String>>(8)
        .ok()
        .flatten()
        .map(|sig| ClinVarData {
            significance: sig,
            review_stars: row.get(9).unwrap_or(0),
            conditions: row
                .get::<_, Option<String>>(10)
                .ok()
                .flatten()
                .unwrap_or_default(),
            last_reviewed: row.get::<_, Option<String>>(11).ok().flatten(),
            submission_count: row.get(12).unwrap_or(0),
        });

    let gnomad = row
        .get::<_, Option<f64>>(13)
        .ok()
        .flatten()
        .map(|af| GnomadData {
            af_global: af,
            af_afr: row.get(14).unwrap_or(0.0),
            af_amr: row.get(15).unwrap_or(0.0),
            af_asj: row.get(16).unwrap_or(0.0),
            af_eas: row.get(17).unwrap_or(0.0),
            af_fin: row.get(18).unwrap_or(0.0),
            af_nfe: row.get(19).unwrap_or(0.0),
            af_sas: row.get(20).unwrap_or(0.0),
        });

    let pharmgkb = row
        .get::<_, Option<String>>(21)
        .ok()
        .flatten()
        .map(|drug| PharmgkbData {
            drug,
            phenotype: row.get::<_, Option<String>>(22).ok().flatten(),
            evidence_level: row
                .get::<_, Option<String>>(23)
                .ok()
                .flatten()
                .unwrap_or_default(),
        });

    AnnotatedVariant {
        chrom: row.get(1).unwrap_or_default(),
        pos: row.get::<_, i64>(2).unwrap_or(0) as u64,
        reference: row.get(3).unwrap_or_default(),
        alt: row.get(4).unwrap_or_default(),
        rsid: row.get(5).ok(),
        gene: row.get(6).ok(),
        assembly: row.get(7).unwrap_or_else(|_| "GRCh38".to_string()),
        clinvar,
        gnomad,
        pharmgkb,
    }
}

fn db_path() -> PathBuf {
    dirs::data_dir()
        .unwrap_or_else(|| PathBuf::from("."))
        .join("genome-sh")
        .join("variants.db")
}

pub async fn install(tier: DbTier) -> Result<()> {
    println!("Installing {:?} database...", tier);

    let db = Database::open_or_create()?;

    match tier {
        DbTier::Lite => {
            download::download_clinvar(&db).await?;
        }
        DbTier::Standard => {
            download::download_clinvar(&db).await?;
            download::download_gnomad(&db).await?;
        }
        DbTier::Full => {
            download::download_clinvar(&db).await?;
            download::download_gnomad(&db).await?;
            download::download_dbsnp(&db).await?;
            download::download_pharmgkb(&db).await?;
        }
    }

    println!("Database installed successfully.");
    status()?;

    Ok(())
}

pub async fn update() -> Result<()> {
    eprintln!("Database update is not yet implemented.");
    Ok(())
}

pub fn status() -> Result<()> {
    let path = db_path();
    if !path.exists() {
        println!("No database installed.");
        println!("Run `genome db install` to get started.");
        return Ok(());
    }

    let metadata = std::fs::metadata(&path)?;
    let size_mb = metadata.len() as f64 / 1_048_576.0;

    println!("Database: {}", path.display());
    println!("Size: {size_mb:.1} MB");

    let db = Database::open()?;
    let conn = db.connection();
    let variant_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM variants", [], |row| row.get(0))?;
    let clinvar_count: i64 = conn
        .query_row("SELECT COUNT(*) FROM clinvar", [], |row| row.get(0))?;

    println!("Variants: {variant_count}");
    println!("  ClinVar annotations: {clinvar_count}");

    Ok(())
}

pub fn remove(database: &str) -> Result<()> {
    match database {
        "all" => {
            let path = db_path();
            if path.exists() {
                std::fs::remove_file(&path)?;
                println!("Database removed.");
            } else {
                println!("No database found.");
            }
        }
        other => {
            eprintln!("Removing individual databases ({other}) is not yet implemented.");
            eprintln!("Use `genome db remove all` to remove everything.");
        }
    }

    Ok(())
}