pub mod config;
mod download;
mod schema;
use std::path::PathBuf;
use std::sync::Mutex;
use anyhow::{Context, Result};
use rusqlite::Connection;
use crate::io::vcf::VcfRecord;
use crate::variant::{AnnotatedVariant, ClinVarData, GnomadData, PharmgkbData, VariantQuery};
#[derive(Clone, Debug, clap::ValueEnum)]
pub enum DbTier {
Lite,
Standard,
Full,
}
pub struct Database {
conn: Mutex<Connection>,
}
impl Database {
pub fn open() -> Result<Self> {
let path = db_path();
if !path.exists() {
anyhow::bail!(
"No database found at {}. Run `genome db install` first.",
path.display()
);
}
Self::open_at(&path)
}
pub fn open_at(path: &std::path::Path) -> Result<Self> {
if !path.exists() {
anyhow::bail!("No database found at {}.", path.display());
}
let conn = Connection::open(path)
.with_context(|| format!("Failed to open database at {}", path.display()))?;
conn.execute_batch(
"PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA cache_size = -64000;
PRAGMA mmap_size = 268435456;",
)?;
Ok(Self {
conn: Mutex::new(conn),
})
}
pub fn open_or_create() -> Result<Self> {
let path = db_path();
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent)?;
}
let conn = Connection::open(&path)?;
conn.execute_batch(
"PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA cache_size = -128000;",
)?;
schema::create_tables(&conn)?;
Ok(Self {
conn: Mutex::new(conn),
})
}
pub fn query(&self, query: &VariantQuery) -> Result<Vec<AnnotatedVariant>> {
match query {
VariantQuery::Rsid(rsid) => self.query_by_rsid(rsid),
VariantQuery::Coordinates {
chrom,
pos,
r#ref,
alt,
} => self.query_by_coordinates(chrom, *pos, r#ref, alt),
VariantQuery::Gene(gene) => self.query_by_gene(gene),
VariantQuery::Hgvs(hgvs) => self.query_by_hgvs(hgvs),
}
}
pub fn annotate_vcf_record(&self, record: &VcfRecord) -> Result<Option<AnnotatedVariant>> {
let results =
self.query_by_coordinates(&record.chrom, record.pos, &record.reference, &record.alt)?;
Ok(results.into_iter().next())
}
pub fn connection(&self) -> std::sync::MutexGuard<'_, Connection> {
self.conn.lock().expect("database mutex poisoned")
}
fn query_by_rsid(&self, rsid: &str) -> Result<Vec<AnnotatedVariant>> {
let normalized = if rsid.starts_with("rs") {
rsid.to_string()
} else {
format!("rs{rsid}")
};
let conn = self.connection();
let mut stmt = conn.prepare_cached(VARIANT_QUERY_SQL)?;
let variants = stmt
.query_map(rusqlite::params![normalized], |row| {
Ok(row_to_variant(row))
})?
.collect::<Result<Vec<_>, _>>()?;
Ok(variants)
}
fn query_by_coordinates(
&self,
chrom: &str,
pos: u64,
reference: &str,
alt: &str,
) -> Result<Vec<AnnotatedVariant>> {
let conn = self.connection();
let mut stmt = conn.prepare_cached(
"SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
p.drug, p.phenotype, p.evidence_level
FROM variants v
LEFT JOIN clinvar c ON c.variant_id = v.id
LEFT JOIN gnomad_cache g ON g.variant_id = v.id
LEFT JOIN pharmgkb p ON p.variant_id = v.id
WHERE v.chrom = ?1 AND v.pos = ?2 AND v.ref = ?3 AND v.alt = ?4",
)?;
let variants = stmt
.query_map(rusqlite::params![chrom, pos as i64, reference, alt], |row| {
Ok(row_to_variant(row))
})?
.collect::<Result<Vec<_>, _>>()?;
Ok(variants)
}
fn query_by_gene(&self, gene: &str) -> Result<Vec<AnnotatedVariant>> {
let conn = self.connection();
let mut stmt = conn.prepare_cached(
"SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
p.drug, p.phenotype, p.evidence_level
FROM variants v
LEFT JOIN clinvar c ON c.variant_id = v.id
LEFT JOIN gnomad_cache g ON g.variant_id = v.id
LEFT JOIN pharmgkb p ON p.variant_id = v.id
WHERE v.gene = ?1 AND c.significance IN ('Pathogenic', 'Likely pathogenic')
ORDER BY c.review_stars DESC
LIMIT 100",
)?;
let variants = stmt
.query_map([&gene.to_uppercase()], |row| Ok(row_to_variant(row)))?
.collect::<Result<Vec<_>, _>>()?;
Ok(variants)
}
fn query_by_hgvs(&self, _hgvs: &str) -> Result<Vec<AnnotatedVariant>> {
anyhow::bail!("HGVS query is not yet implemented. Use rsID or coordinates instead.")
}
}
const VARIANT_QUERY_SQL: &str =
"SELECT v.id, v.chrom, v.pos, v.ref, v.alt, v.rsid, v.gene, v.assembly,
c.significance, c.review_stars, c.conditions, c.last_reviewed, c.submission_count,
g.af_global, g.af_afr, g.af_amr, g.af_asj, g.af_eas, g.af_fin, g.af_nfe, g.af_sas,
p.drug, p.phenotype, p.evidence_level
FROM variants v
LEFT JOIN clinvar c ON c.variant_id = v.id
LEFT JOIN gnomad_cache g ON g.variant_id = v.id
LEFT JOIN pharmgkb p ON p.variant_id = v.id
WHERE v.rsid = ?1";
fn row_to_variant(row: &rusqlite::Row) -> AnnotatedVariant {
let clinvar = row
.get::<_, Option<String>>(8)
.ok()
.flatten()
.map(|sig| ClinVarData {
significance: sig,
review_stars: row.get(9).unwrap_or(0),
conditions: row
.get::<_, Option<String>>(10)
.ok()
.flatten()
.unwrap_or_default(),
last_reviewed: row.get::<_, Option<String>>(11).ok().flatten(),
submission_count: row.get(12).unwrap_or(0),
});
let gnomad = row
.get::<_, Option<f64>>(13)
.ok()
.flatten()
.map(|af| GnomadData {
af_global: af,
af_afr: row.get(14).unwrap_or(0.0),
af_amr: row.get(15).unwrap_or(0.0),
af_asj: row.get(16).unwrap_or(0.0),
af_eas: row.get(17).unwrap_or(0.0),
af_fin: row.get(18).unwrap_or(0.0),
af_nfe: row.get(19).unwrap_or(0.0),
af_sas: row.get(20).unwrap_or(0.0),
});
let pharmgkb = row
.get::<_, Option<String>>(21)
.ok()
.flatten()
.map(|drug| PharmgkbData {
drug,
phenotype: row.get::<_, Option<String>>(22).ok().flatten(),
evidence_level: row
.get::<_, Option<String>>(23)
.ok()
.flatten()
.unwrap_or_default(),
});
AnnotatedVariant {
chrom: row.get(1).unwrap_or_default(),
pos: row.get::<_, i64>(2).unwrap_or(0) as u64,
reference: row.get(3).unwrap_or_default(),
alt: row.get(4).unwrap_or_default(),
rsid: row.get(5).ok(),
gene: row.get(6).ok(),
assembly: row.get(7).unwrap_or_else(|_| "GRCh38".to_string()),
clinvar,
gnomad,
pharmgkb,
}
}
fn db_path() -> PathBuf {
dirs::data_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join("genome-sh")
.join("variants.db")
}
pub async fn install(tier: DbTier) -> Result<()> {
println!("Installing {:?} database...", tier);
let db = Database::open_or_create()?;
match tier {
DbTier::Lite => {
download::download_clinvar(&db).await?;
}
DbTier::Standard => {
download::download_clinvar(&db).await?;
download::download_gnomad(&db).await?;
}
DbTier::Full => {
download::download_clinvar(&db).await?;
download::download_gnomad(&db).await?;
download::download_dbsnp(&db).await?;
download::download_pharmgkb(&db).await?;
}
}
println!("Database installed successfully.");
status()?;
Ok(())
}
pub async fn update() -> Result<()> {
eprintln!("Database update is not yet implemented.");
Ok(())
}
pub fn status() -> Result<()> {
let path = db_path();
if !path.exists() {
println!("No database installed.");
println!("Run `genome db install` to get started.");
return Ok(());
}
let metadata = std::fs::metadata(&path)?;
let size_mb = metadata.len() as f64 / 1_048_576.0;
println!("Database: {}", path.display());
println!("Size: {size_mb:.1} MB");
let db = Database::open()?;
let conn = db.connection();
let variant_count: i64 = conn
.query_row("SELECT COUNT(*) FROM variants", [], |row| row.get(0))?;
let clinvar_count: i64 = conn
.query_row("SELECT COUNT(*) FROM clinvar", [], |row| row.get(0))?;
println!("Variants: {variant_count}");
println!(" ClinVar annotations: {clinvar_count}");
Ok(())
}
pub fn remove(database: &str) -> Result<()> {
match database {
"all" => {
let path = db_path();
if path.exists() {
std::fs::remove_file(&path)?;
println!("Database removed.");
} else {
println!("No database found.");
}
}
other => {
eprintln!("Removing individual databases ({other}) is not yet implemented.");
eprintln!("Use `genome db remove all` to remove everything.");
}
}
Ok(())
}