//! rsomics-vcf-validate 0.1.0
//!
//! Validate VCF format integrity.
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use rsomics_common::{Result, RsomicsError};

/// Outcome of a VCF validation pass, as produced by [`validate_vcf`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VcfValidation {
    /// Number of data (non-`#`) lines encountered, including malformed ones.
    pub variants: u64,
    /// Human-readable descriptions of every problem found, in the order
    /// they were detected.
    pub errors: Vec<String>,
    /// `true` when no problems were recorded, i.e. `errors` is empty.
    pub is_valid: bool,
    /// `true` when at least one `##` meta-information line was seen.
    pub has_header: bool,
}

/// Checks the structural integrity of the tab-delimited VCF file at `input`.
///
/// The file is read line by line and classified as follows:
///
/// * lines starting with `##` are meta-information and set `has_header`;
/// * the line starting with `#CHROM` is the column header and must contain
///   at least 8 tab-separated fields;
/// * any other line starting with `#` is skipped;
/// * every remaining line is counted as a variant record and must have at
///   least 8 tab-separated fields, a `POS` (field 2) that parses as an
///   unsigned integer, and a `REF` (field 4) that is neither empty nor `.`.
///
/// In addition, the `#CHROM` column header must be present and must appear
/// before the first variant record.
///
/// Format problems never abort the scan; they are collected in
/// [`VcfValidation::errors`] and `is_valid` is `true` only when that list is
/// empty.
///
/// # Errors
///
/// Returns `RsomicsError::InvalidInput` if the file cannot be opened and
/// `RsomicsError::Io` if reading a line fails.
pub fn validate_vcf(input: &Path) -> Result<VcfValidation> {
    let file = File::open(input)
        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
    let reader = BufReader::new(file);
    let mut variants: u64 = 0;
    let mut errors: Vec<String> = Vec::new();
    let mut line_num: u64 = 0;
    let mut has_header = false;
    let mut has_column_line = false;
    // Line number of the first data record seen before any `#CHROM` line.
    let mut record_before_header: Option<u64> = None;

    for line in reader.lines() {
        let line = line.map_err(RsomicsError::Io)?;
        line_num += 1;

        if line.starts_with("##") {
            has_header = true;
            continue;
        }

        if line.starts_with("#CHROM") {
            has_column_line = true;
            let column_count = line.split('\t').count();
            if column_count < 8 {
                errors.push(format!(
                    "line {line_num}: column header has {column_count} fields, need >=8"
                ));
            }
            continue;
        }

        // Any other single-`#` line is treated as a comment.
        if line.starts_with('#') {
            continue;
        }

        // From here on the line is a data record. Remember the first one that
        // shows up ahead of the column header so the ordering violation can
        // be reported once the whole file has been scanned.
        if !has_column_line && record_before_header.is_none() {
            record_before_header = Some(line_num);
        }

        // Malformed records still count towards the total.
        variants += 1;

        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 8 {
            errors.push(format!(
                "line {line_num}: variant has {} fields, need >=8",
                fields.len()
            ));
            continue;
        }

        if fields[1].parse::<u64>().is_err() {
            errors.push(format!("line {line_num}: POS is not a valid integer"));
        }

        let ref_allele = fields[3];
        if ref_allele.is_empty() || ref_allele == "." {
            errors.push(format!("line {line_num}: REF allele is empty or '.'"));
        }
    }

    if !has_column_line {
        errors.push("#CHROM column header line not found".to_string());
    } else if let Some(first) = record_before_header {
        // A header exists but came too late: records preceded it.
        errors.push(format!(
            "line {first}: variant record appears before #CHROM column header"
        ));
    }

    let is_valid = errors.is_empty();
    Ok(VcfValidation {
        variants,
        errors,
        is_valid,
        has_header,
    })
}