Skip to main content

rsomics_vcf_validate/
lib.rs

1use std::fs::File;
2use std::io::{BufRead, BufReader};
3use std::path::Path;
4
5use rsomics_common::{Result, RsomicsError};
6
/// Outcome of running [`validate_vcf`] over a single file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VcfValidation {
    /// Number of non-`#` lines encountered. Lines with too few fields are
    /// still counted here, in addition to being reported in `errors`.
    pub variants: u64,
    /// Human-readable descriptions of every problem found, in file order,
    /// followed by any file-level problems detected after the scan.
    pub errors: Vec<String>,
    /// `true` exactly when `errors` is empty.
    pub is_valid: bool,
    /// `true` when at least one line starting with `##` was seen.
    pub has_header: bool,
}
13
14pub fn validate_vcf(input: &Path) -> Result<VcfValidation> {
15    let file = File::open(input)
16        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
17    let reader = BufReader::new(file);
18    let mut variants: u64 = 0;
19    let mut errors: Vec<String> = Vec::new();
20    let mut line_num: u64 = 0;
21    let mut has_header = false;
22    let mut has_column_line = false;
23
24    for line in reader.lines() {
25        let line = line.map_err(RsomicsError::Io)?;
26        line_num += 1;
27
28        if line.starts_with("##") {
29            has_header = true;
30            continue;
31        }
32
33        if line.starts_with("#CHROM") {
34            has_column_line = true;
35            let cols: Vec<&str> = line.split('\t').collect();
36            if cols.len() < 8 {
37                errors.push(format!(
38                    "line {line_num}: column header has {} fields, need >=8",
39                    cols.len()
40                ));
41            }
42            continue;
43        }
44
45        if line.starts_with('#') {
46            continue;
47        }
48
49        let fields: Vec<&str> = line.split('\t').collect();
50        if fields.len() < 8 {
51            errors.push(format!(
52                "line {line_num}: variant has {} fields, need >=8",
53                fields.len()
54            ));
55            variants += 1;
56            continue;
57        }
58
59        if fields[1].parse::<u64>().is_err() {
60            errors.push(format!("line {line_num}: POS is not a valid integer"));
61        }
62
63        let ref_allele = fields[3];
64        if ref_allele.is_empty() || ref_allele == "." {
65            errors.push(format!("line {line_num}: REF allele is empty or '.'"));
66        }
67
68        variants += 1;
69    }
70
71    if !has_column_line {
72        errors.push("#CHROM column header line not found".to_string());
73    }
74
75    let is_valid = errors.is_empty();
76    Ok(VcfValidation {
77        variants,
78        errors,
79        is_valid,
80        has_header,
81    })
82}