use std::collections::BTreeMap;
use thiserror::Error;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Zygosity {
Het,
Hom,
}
#[derive(Debug, Clone, PartialEq)]
pub struct VcfVariant {
pub chrom: String,
pub pos0: u32,
pub reference: Vec<u8>,
pub alternate: Vec<u8>,
pub qual: Option<f32>,
pub filter: String,
pub info: BTreeMap<String, String>,
pub genotype: Option<Zygosity>,
}
#[derive(Debug, Error)]
pub enum VcfParseError {
#[error("invalid VCF line {line}: {msg}")]
InvalidLine {
line: usize,
msg: String,
},
}
pub fn read_vcf_variants(contents: &str) -> Result<Vec<VcfVariant>, VcfParseError> {
let mut out = Vec::new();
for (i, line) in contents.lines().enumerate() {
let line_no = i + 1;
let trimmed = line.trim();
if trimmed.is_empty() || trimmed.starts_with('#') {
continue;
}
let cols: Vec<&str> = trimmed.split('\t').collect();
if cols.len() < 8 {
return Err(VcfParseError::InvalidLine {
line: line_no,
msg: "expected at least 8 tab-separated columns".to_string(),
});
}
let chrom = cols[0].to_string();
let pos1: u32 = cols[1].parse().map_err(|_| VcfParseError::InvalidLine {
line: line_no,
msg: "POS is not a u32".to_string(),
})?;
if pos1 == 0 {
return Err(VcfParseError::InvalidLine {
line: line_no,
msg: "POS must be 1-based".to_string(),
});
}
let pos0 = pos1 - 1;
let reference = cols[3].as_bytes().to_vec();
let qual = if cols[5] == "." {
None
} else {
Some(cols[5].parse().map_err(|_| VcfParseError::InvalidLine {
line: line_no,
msg: "QUAL is not a float".to_string(),
})?)
};
let filter = cols[6].to_string();
let info = parse_info(cols[7]);
let gt_indices = parse_gt_indices(cols.get(8).copied(), cols.get(9).copied());
for (alt_idx, alt) in cols[4].split(',').enumerate() {
let allele_index = (alt_idx + 1) as u8; let genotype = match >_indices {
None => None,
Some(idx) => match idx.iter().filter(|&&a| a == allele_index).count() {
0 => continue, 1 => Some(Zygosity::Het),
_ => Some(Zygosity::Hom),
},
};
out.push(VcfVariant {
chrom: chrom.clone(),
pos0,
reference: reference.clone(),
alternate: alt.as_bytes().to_vec(),
qual,
filter: filter.clone(),
info: info.clone(),
genotype,
});
}
}
Ok(out)
}
fn parse_gt_indices(format: Option<&str>, sample: Option<&str>) -> Option<Vec<u8>> {
let (format, sample) = (format?, sample?);
let gt_pos = format.split(':').position(|k| k == "GT")?;
let gt = sample.split(':').nth(gt_pos)?;
let indices: Vec<u8> = gt
.split(['/', '|'])
.filter_map(|a| a.parse::<u8>().ok()) .collect();
if indices.is_empty() {
None
} else {
Some(indices)
}
}
fn parse_info(info: &str) -> BTreeMap<String, String> {
let mut map = BTreeMap::new();
if info == "." {
return map;
}
for item in info.split(';') {
if item.is_empty() {
continue;
}
if let Some((k, v)) = item.split_once('=') {
map.insert(k.to_string(), v.to_string());
} else {
map.insert(item.to_string(), "true".to_string());
}
}
map
}
#[cfg(test)]
mod gt_tests {
use super::*;
#[test]
fn parses_gt_zygosity_from_format_sample() {
let vcf = "chr1\t10\t.\tA\tC\t.\tPASS\t.\tGT:DP\t0/1:30\n\
chr1\t20\t.\tG\tT\t.\tPASS\t.\tGT\t1/1\n\
chr1\t30\t.\tT\tA\t.\tPASS\t.\tGT\t0|1\n";
let v = read_vcf_variants(vcf).unwrap();
assert_eq!(v[0].genotype, Some(Zygosity::Het));
assert_eq!(v[1].genotype, Some(Zygosity::Hom));
assert_eq!(v[2].genotype, Some(Zygosity::Het)); }
#[test]
fn missing_sample_is_genotype_none_and_still_parses() {
let vcf = "chr1\t10\t.\tA\tC\t.\tPASS\t.\n";
let v = read_vcf_variants(vcf).unwrap();
assert_eq!(v.len(), 1);
assert_eq!(v[0].genotype, None);
}
#[test]
fn multiallelic_decomposes_per_alt_with_decomposed_genotype() {
let vcf = "chr1\t10\t.\tG\tA,C\t.\tPASS\t.\tGT\t1/2\n";
let v = read_vcf_variants(vcf).unwrap();
assert_eq!(v.len(), 2);
assert_eq!(v[0].alternate, b"A");
assert_eq!(v[0].genotype, Some(Zygosity::Het));
assert_eq!(v[1].alternate, b"C");
assert_eq!(v[1].genotype, Some(Zygosity::Het));
}
#[test]
fn multiallelic_hom_second_allele_drops_the_absent_first() {
let vcf = "chr1\t10\t.\tG\tA,C\t.\tPASS\t.\tGT\t2/2\n";
let v = read_vcf_variants(vcf).unwrap();
assert_eq!(v.len(), 1);
assert_eq!(v[0].alternate, b"C");
assert_eq!(v[0].genotype, Some(Zygosity::Hom));
}
#[test]
fn multiallelic_without_gt_emits_all_alleles_unscored() {
let vcf = "chr1\t10\t.\tG\tA,C\t.\tPASS\t.\n";
let v = read_vcf_variants(vcf).unwrap();
assert_eq!(v.len(), 2);
assert_eq!(v[0].genotype, None);
assert_eq!(v[1].genotype, None);
}
}