use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use std::path::Path;
use anyhow::{Context, Result};
use noodles_bgzf as bgzf;
use tracing::warn;
use crate::core::types::{MutationType, Variant};
#[derive(Debug, Default)]
pub struct VcfParseResult {
pub variants: Vec<Variant>,
pub skipped: usize,
}
pub type RefSequences = HashMap<String, Vec<u8>>;
pub fn parse_vcf(
path: &Path,
known_chroms: Option<&[String]>,
ref_seqs: Option<&RefSequences>,
) -> Result<VcfParseResult> {
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
let is_bgzipped = matches!(ext, "gz" | "bgz");
if is_bgzipped {
let file = std::fs::File::open(path)
.with_context(|| format!("failed to open VCF: {}", path.display()))?;
let bgzf_reader = bgzf::io::Reader::new(file);
let buf_reader = BufReader::new(bgzf_reader);
parse_from_reader(buf_reader, known_chroms, ref_seqs)
} else {
let file = std::fs::File::open(path)
.with_context(|| format!("failed to open VCF: {}", path.display()))?;
let buf_reader = BufReader::new(file);
parse_from_reader(buf_reader, known_chroms, ref_seqs)
}
}
pub fn parse_from_reader<R: BufRead>(
reader: R,
known_chroms: Option<&[String]>,
ref_seqs: Option<&RefSequences>,
) -> Result<VcfParseResult> {
let mut result = VcfParseResult::default();
for line in reader.lines() {
let line = line.context("I/O error reading VCF")?;
if line.starts_with('#') {
continue;
}
if line.trim().is_empty() {
continue;
}
match parse_record(&line, known_chroms, ref_seqs) {
Ok(Some(variant)) => result.variants.push(variant),
Ok(None) => result.skipped += 1,
Err(e) => {
warn!("skipping malformed VCF record: {e}");
result.skipped += 1;
}
}
}
tracing::debug!(
loaded = result.variants.len(),
skipped = result.skipped,
"VCF parse complete"
);
Ok(result)
}
#[cfg(test)]
pub fn parse_vcf_str(
input: &str,
known_chroms: Option<&[String]>,
ref_seqs: Option<&RefSequences>,
) -> Result<VcfParseResult> {
parse_from_reader(input.as_bytes(), known_chroms, ref_seqs)
}
fn parse_record(
line: &str,
known_chroms: Option<&[String]>,
ref_seqs: Option<&RefSequences>,
) -> Result<Option<Variant>> {
let fields: Vec<&str> = line.splitn(9, '\t').collect();
anyhow::ensure!(fields.len() >= 5, "VCF record has fewer than 5 fields");
let chrom = fields[0];
let pos_1based: u64 = fields[1]
.parse()
.with_context(|| format!("invalid POS '{}'", fields[1]))?;
let ref_allele = fields[3];
let alt_field = fields[4];
if alt_field == "." {
warn!(
chrom = chrom,
pos = pos_1based,
"skipping record with no ALT allele"
);
return Ok(None);
}
let mut alt_iter = alt_field.splitn(2, ',');
let alt_allele = alt_iter.next().unwrap_or(alt_field);
if alt_iter.next().is_some() {
warn!(
chrom = chrom,
pos = pos_1based,
alts = alt_field,
"multi-ALT record detected; only the first ALT allele will be used"
);
}
if alt_allele.contains('[')
|| alt_allele.contains(']')
|| alt_allele.starts_with('.')
|| alt_allele.ends_with('.')
{
warn!(
chrom = chrom,
pos = pos_1based,
alt = alt_allele,
"skipping SV/BND record"
);
return Ok(None);
}
if let Some(chroms) = known_chroms {
if !chroms.iter().any(|c| c == chrom) {
warn!(
chrom = chrom,
"skipping variant on chromosome not in reference"
);
return Ok(None);
}
}
let pos_0based: u64 = pos_1based
.checked_sub(1)
.context("VCF POS is 0, which is invalid")?;
if let Some(seqs) = ref_seqs {
if let Some(seq) = seqs.get(chrom) {
let start = pos_0based as usize;
let end = start + ref_allele.len();
if end <= seq.len() {
let genome_ref = std::str::from_utf8(&seq[start..end])
.unwrap_or("")
.to_uppercase();
if genome_ref != ref_allele.to_uppercase() {
warn!(
chrom = chrom,
pos = pos_1based,
vcf_ref = ref_allele,
genome_ref = genome_ref.as_str(),
"REF allele mismatch (possible reference build difference); keeping variant"
);
}
}
}
}
let info_str = if fields.len() >= 8 { fields[7] } else { "." };
let (vaf, clone_id) = parse_info(info_str);
let mutation = classify_mutation(pos_0based, ref_allele, alt_allele)?;
Ok(Some(Variant {
chrom: chrom.to_string(),
mutation,
expected_vaf: vaf,
clone_id,
haplotype: None,
ccf: None,
}))
}
fn parse_info(info: &str) -> (f64, Option<String>) {
const DEFAULT_VAF: f64 = 0.5;
const DEFAULT_CLONE: &str = "founder";
if info == "." || info.is_empty() {
return (DEFAULT_VAF, Some(DEFAULT_CLONE.to_string()));
}
let mut vaf: Option<f64> = None;
let mut clone: Option<String> = None;
for field in info.split(';') {
if let Some(val) = field.strip_prefix("VAF=") {
vaf = val.parse().ok();
} else if vaf.is_none() {
if let Some(val) = field.strip_prefix("AF=") {
let first = val.split(',').next().unwrap_or(val);
vaf = first.parse().ok();
}
}
if let Some(val) = field.strip_prefix("CLONE=") {
clone = Some(val.to_string());
}
}
let vaf = vaf.unwrap_or(DEFAULT_VAF);
let clone = Some(clone.unwrap_or_else(|| DEFAULT_CLONE.to_string()));
(vaf, clone)
}
fn classify_mutation(pos: u64, ref_allele: &str, alt_allele: &str) -> Result<MutationType> {
let ref_bytes = ref_allele.as_bytes().to_vec();
let alt_bytes = alt_allele.as_bytes().to_vec();
let ref_len = ref_bytes.len();
let alt_len = alt_bytes.len();
anyhow::ensure!(ref_len > 0, "REF allele is empty");
anyhow::ensure!(alt_len > 0, "ALT allele is empty");
let mutation = if ref_len == 1 && alt_len == 1 {
MutationType::Snv {
pos,
ref_base: ref_bytes[0],
alt_base: alt_bytes[0],
}
} else if ref_len == alt_len && ref_len > 1 {
MutationType::Mnv {
pos,
ref_seq: ref_bytes,
alt_seq: alt_bytes,
}
} else {
MutationType::Indel {
pos,
ref_seq: ref_bytes,
alt_seq: alt_bytes,
}
};
Ok(mutation)
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
const HEADER: &str = "\
##fileformat=VCFv4.3
##INFO=<ID=VAF,Number=1,Type=Float,Description=\"VAF\">
##INFO=<ID=AF,Number=A,Type=Float,Description=\"AF\">
##INFO=<ID=CLONE,Number=1,Type=String,Description=\"Clone\">
##INFO=<ID=CN,Number=1,Type=Integer,Description=\"Copy number\">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
";
fn known_chroms() -> Vec<String> {
vec!["chr1".to_string(), "chr2".to_string(), "chrX".to_string()]
}
#[test]
fn test_parse_snv() {
let vcf = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\t.\n");
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(result.variants.len(), 1);
assert_eq!(result.skipped, 0);
let v = &result.variants[0];
assert_eq!(v.chrom, "chr1");
assert!(
matches!(
&v.mutation,
MutationType::Snv {
pos: 999,
ref_base: b'A',
alt_base: b'T'
}
),
"unexpected mutation: {:?}",
v.mutation
);
}
#[test]
fn test_parse_indel() {
let vcf = format!(
"{HEADER}\
chr1\t100\t.\tA\tATG\t.\tPASS\t.\n\
chr1\t200\t.\tATG\tA\t.\tPASS\t.\n"
);
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(result.variants.len(), 2);
let ins = &result.variants[0];
assert!(
matches!(&ins.mutation, MutationType::Indel { pos: 99, .. }),
"expected insertion indel at pos 99, got {:?}",
ins.mutation
);
if let MutationType::Indel {
ref_seq, alt_seq, ..
} = &ins.mutation
{
assert_eq!(ref_seq, b"A");
assert_eq!(alt_seq, b"ATG");
}
let del = &result.variants[1];
assert!(
matches!(&del.mutation, MutationType::Indel { pos: 199, .. }),
"expected deletion indel, got {:?}",
del.mutation
);
if let MutationType::Indel {
ref_seq, alt_seq, ..
} = &del.mutation
{
assert_eq!(ref_seq, b"ATG");
assert_eq!(alt_seq, b"A");
}
}
#[test]
fn test_parse_mnv() {
let vcf = format!("{HEADER}chr1\t500\t.\tAC\tTG\t.\tPASS\t.\n");
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(result.variants.len(), 1);
let v = &result.variants[0];
assert!(
matches!(&v.mutation, MutationType::Mnv { pos: 499, .. }),
"expected MNV, got {:?}",
v.mutation
);
if let MutationType::Mnv {
ref_seq, alt_seq, ..
} = &v.mutation
{
assert_eq!(ref_seq, b"AC");
assert_eq!(alt_seq, b"TG");
}
}
#[test]
fn test_parse_multiple() {
let vcf = format!(
"{HEADER}\
chr1\t100\t.\tA\tC\t.\tPASS\t.\n\
chr1\t200\t.\tG\tT\t.\tPASS\t.\n\
chr2\t50\t.\tA\tATTT\t.\tPASS\t.\n\
chr2\t300\t.\tGCTA\tACTG\t.\tPASS\t.\n"
);
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(result.variants.len(), 4);
assert_eq!(result.skipped, 0);
assert_eq!(result.variants[0].chrom, "chr1");
assert_eq!(result.variants[2].chrom, "chr2");
}
#[test]
fn test_custom_vaf() {
let vcf_vaf = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\tVAF=0.12\n");
let result = parse_vcf_str(&vcf_vaf, Some(&known_chroms()), None).unwrap();
assert!((result.variants[0].expected_vaf - 0.12).abs() < 1e-9);
let vcf_af = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\tAF=0.33\n");
let result = parse_vcf_str(&vcf_af, Some(&known_chroms()), None).unwrap();
assert!((result.variants[0].expected_vaf - 0.33).abs() < 1e-9);
let vcf_both = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\tVAF=0.25;AF=0.99\n");
let result = parse_vcf_str(&vcf_both, Some(&known_chroms()), None).unwrap();
assert!((result.variants[0].expected_vaf - 0.25).abs() < 1e-9);
}
#[test]
fn test_clone_assignment() {
let vcf = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\tCLONE=subclone_B\n");
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(result.variants[0].clone_id.as_deref(), Some("subclone_B"));
}
#[test]
fn test_missing_info_defaults() {
let vcf = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\t.\n");
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
let v = &result.variants[0];
assert!(
(v.expected_vaf - 0.5).abs() < 1e-9,
"expected default VAF 0.5, got {}",
v.expected_vaf
);
assert_eq!(v.clone_id.as_deref(), Some("founder"));
}
#[test]
fn test_ref_mismatch_warning() {
let mut ref_seqs: RefSequences = HashMap::new();
let mut seq = vec![b'G'; 2000];
seq[999] = b'C'; ref_seqs.insert("chr1".to_string(), seq);
let vcf = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\t.\n");
let result = parse_vcf_str(&vcf, Some(&known_chroms()), Some(&ref_seqs)).unwrap();
assert_eq!(
result.variants.len(),
1,
"REF mismatch should warn, not skip"
);
}
#[test]
fn test_skip_unknown_chrom() {
let vcf = format!(
"{HEADER}\
chr1\t1000\t.\tA\tT\t.\tPASS\t.\n\
chrUNKNOWN\t500\t.\tG\tC\t.\tPASS\t.\n\
chr2\t200\t.\tA\tG\t.\tPASS\t.\n"
);
let result = parse_vcf_str(&vcf, Some(&known_chroms()), None).unwrap();
assert_eq!(
result.variants.len(),
2,
"only chr1 and chr2 should be kept"
);
assert_eq!(result.skipped, 1, "chrUNKNOWN record should be skipped");
}
#[test]
fn test_bgzipped_vcf() {
let vcf_content = format!("{HEADER}chr1\t1000\t.\tA\tT\t.\tPASS\tVAF=0.42\n");
let mut tmp = NamedTempFile::with_suffix(".vcf.gz").unwrap();
{
let mut writer = bgzf::io::Writer::new(&mut tmp);
writer.write_all(vcf_content.as_bytes()).unwrap();
writer.try_finish().unwrap();
}
let result = parse_vcf(tmp.path(), Some(&known_chroms()), None).unwrap();
assert_eq!(
result.variants.len(),
1,
"bgzipped VCF should parse correctly"
);
let v = &result.variants[0];
assert!(
(v.expected_vaf - 0.42).abs() < 1e-9,
"VAF mismatch: {}",
v.expected_vaf
);
assert!(matches!(&v.mutation, MutationType::Snv { pos: 999, .. }));
}
}