rsomics_vcf_validate/
lib.rs1use std::fs::File;
2use std::io::{BufRead, BufReader};
3use std::path::Path;
4
5use rsomics_common::{Result, RsomicsError};
6
/// Outcome of a structural check of a VCF file, as produced by [`validate_vcf`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VcfValidation {
    /// Number of data lines seen, i.e. lines that do not start with `#`.
    /// Malformed records are counted too.
    pub variants: u64,
    /// Human-readable description of every problem found, in file order.
    pub errors: Vec<String>,
    /// `true` exactly when `errors` is empty.
    pub is_valid: bool,
    /// `true` when at least one `##` meta-information line was present.
    pub has_header: bool,
}
13
14pub fn validate_vcf(input: &Path) -> Result<VcfValidation> {
15 let file = File::open(input)
16 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", input.display())))?;
17 let reader = BufReader::new(file);
18 let mut variants: u64 = 0;
19 let mut errors: Vec<String> = Vec::new();
20 let mut line_num: u64 = 0;
21 let mut has_header = false;
22 let mut has_column_line = false;
23
24 for line in reader.lines() {
25 let line = line.map_err(RsomicsError::Io)?;
26 line_num += 1;
27
28 if line.starts_with("##") {
29 has_header = true;
30 continue;
31 }
32
33 if line.starts_with("#CHROM") {
34 has_column_line = true;
35 let cols: Vec<&str> = line.split('\t').collect();
36 if cols.len() < 8 {
37 errors.push(format!(
38 "line {line_num}: column header has {} fields, need >=8",
39 cols.len()
40 ));
41 }
42 continue;
43 }
44
45 if line.starts_with('#') {
46 continue;
47 }
48
49 let fields: Vec<&str> = line.split('\t').collect();
50 if fields.len() < 8 {
51 errors.push(format!(
52 "line {line_num}: variant has {} fields, need >=8",
53 fields.len()
54 ));
55 variants += 1;
56 continue;
57 }
58
59 if fields[1].parse::<u64>().is_err() {
60 errors.push(format!("line {line_num}: POS is not a valid integer"));
61 }
62
63 let ref_allele = fields[3];
64 if ref_allele.is_empty() || ref_allele == "." {
65 errors.push(format!("line {line_num}: REF allele is empty or '.'"));
66 }
67
68 variants += 1;
69 }
70
71 if !has_column_line {
72 errors.push("#CHROM column header line not found".to_string());
73 }
74
75 let is_valid = errors.is_empty();
76 Ok(VcfValidation {
77 variants,
78 errors,
79 is_valid,
80 has_header,
81 })
82}