genome-sh 0.1.0

The jq of genomics. Fast, local, human-readable variant analysis.
use std::path::Path;

use anyhow::{Context, Result};
use tokio::io::BufReader;

/// A parsed VCF record (minimal representation for annotation).
///
/// Holds the leading fixed columns of one tab-separated VCF data line. Fields are
/// stored as text or simple scalars; nothing here is decoded further (for example,
/// `alt` and `info` are kept as the raw column text).
#[derive(Debug, Clone)]
pub struct VcfRecord {
    /// Chromosome / contig name from column 1. The parser in this file passes it
    /// through `normalize_chrom`, so records it produces start with `chr`.
    pub chrom: String,
    /// Position from column 2, stored exactly as parsed (no offset is applied here).
    pub pos: u64,
    /// Identifier from column 3; `None` when the column is the placeholder `.`.
    pub id: Option<String>,
    /// Reference allele text from column 4.
    pub reference: String,
    /// Alternate allele text from column 5, kept as a single unsplit string.
    pub alt: String,
    /// Quality from column 6; `None` when the column is absent or does not parse
    /// as a floating-point number (which includes the placeholder `.`).
    pub qual: Option<f64>,
    /// Filter text from column 7, kept verbatim; `None` only when the column is absent.
    pub filter: Option<String>,
    /// Unparsed INFO text as captured by the line parser; empty when the line has
    /// no INFO column.
    pub info: String,
}

/// Streaming VCF reader. Reads records one at a time without loading the full file.
pub struct VcfReader {
    /// Line-by-line stream over the buffered input. The underlying byte source is
    /// boxed as a trait object so one reader type can wrap different async sources.
    lines: tokio::io::Lines<BufReader<Box<dyn tokio::io::AsyncRead + Unpin + Send>>>,
}

impl VcfReader {
    /// Create a reader over the VCF source named by `path`.
    ///
    /// * A path of exactly `-` reads from standard input.
    /// * A path whose extension is `gz` is opened and wrapped in the noodles
    ///   async bgzf reader.
    /// * Any other path is opened and read as-is.
    ///
    /// # Errors
    ///
    /// Returns an error (with the path in its context) when the file cannot be opened.
    pub async fn open(path: &Path) -> Result<Self> {
        let wants_stdin = path.to_string_lossy() == "-";

        let source: Box<dyn tokio::io::AsyncRead + Unpin + Send> = if wants_stdin {
            Box::new(tokio::io::stdin())
        } else {
            let handle = tokio::fs::File::open(path)
                .await
                .with_context(|| format!("Cannot open {}", path.display()))?;

            // The extension alone decides whether the bgzf decoder is layered on top.
            match path.extension() {
                Some(ext) if ext == "gz" => {
                    Box::new(noodles::bgzf::r#async::reader::Reader::new(handle))
                }
                _ => Box::new(handle),
            }
        };

        Ok(Self {
            lines: tokio::io::AsyncBufReadExt::lines(BufReader::new(source)),
        })
    }

    /// Fetch the next data record from the stream.
    ///
    /// Lines beginning with `#` are skipped. Returns `Ok(None)` once the input is
    /// exhausted.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from reading a line and parse errors from
    /// `parse_vcf_line` for the first non-`#` line encountered.
    pub async fn next_record(&mut self) -> Result<Option<VcfRecord>> {
        while let Some(text) = self.lines.next_line().await? {
            if !text.starts_with('#') {
                return parse_vcf_line(&text).map(Some);
            }
        }
        Ok(None)
    }
}

/// Parse one tab-separated VCF data line into a [`VcfRecord`].
///
/// Only the leading fixed columns (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO) are
/// read. Anything after the INFO column (FORMAT and per-sample columns) is ignored,
/// so `info` holds the INFO column alone rather than the rest of the line.
///
/// Leniencies kept from the original behaviour:
/// * lines with only the first five columns are accepted; `qual` and `filter`
///   become `None` and `info` becomes empty when their columns are absent;
/// * a QUAL value that does not parse as a float (including `.`) yields `None`;
/// * an ID of `.` yields `None`; FILTER is stored verbatim.
///
/// # Errors
///
/// Fails when the line has fewer than five tab-separated columns or when the
/// position column is not a valid unsigned integer.
fn parse_vcf_line(line: &str) -> Result<VcfRecord> {
    // Split into at most nine pieces: the eight fixed columns plus one trailing
    // piece that swallows FORMAT/sample columns. With a limit of eight, that
    // remainder would be glued onto INFO.
    let fields: Vec<&str> = line.splitn(9, '\t').collect();
    if fields.len() < 5 {
        anyhow::bail!("Invalid VCF line: too few fields");
    }

    let chrom = normalize_chrom(fields[0]);
    let pos: u64 = fields[1].parse().context("Invalid position")?;
    let id = if fields[2] == "." {
        None
    } else {
        Some(fields[2].to_string())
    };
    let reference = fields[3].to_string();
    let alt = fields[4].to_string();
    // Columns from QUAL onward are optional here, hence `get` rather than indexing.
    let qual = fields.get(5).and_then(|q| q.parse().ok());
    let filter = fields.get(6).map(|f| f.to_string());
    let info = fields
        .get(7)
        .map(|raw| raw.to_string())
        .unwrap_or_default();

    Ok(VcfRecord {
        chrom,
        pos,
        id,
        reference,
        alt,
        qual,
        filter,
        info,
    })
}

/// Return `chrom` with a leading `chr` prefix.
///
/// Names that already begin with the exact lowercase text `chr` are copied
/// unchanged; every other input (including an empty string or a differently
/// cased prefix) gets `chr` prepended.
fn normalize_chrom(chrom: &str) -> String {
    match chrom.strip_prefix("chr") {
        Some(_) => chrom.to_owned(),
        None => ["chr", chrom].concat(),
    }
}