pub mod genotype;
use std::path::Path;
use anyhow::{Context, Result, bail};
use noodles::vcf;
use noodles::vcf::variant::record::AlternateBases;
use crate::sequence_dict::SequenceDictionary;
use genotype::{Genotype, VariantRecord};
pub fn load_variants_for_contig(
path: &Path,
contig_name: &str,
sample_name: Option<&str>,
_dict: &SequenceDictionary,
) -> Result<Vec<VariantRecord>> {
let mut reader = vcf::io::reader::Builder::default()
.build_from_path(path)
.with_context(|| format!("Failed to open VCF: {}", path.display()))?;
let header = reader.read_header()?;
let sample_index = resolve_sample_index(&header, sample_name)?;
let mut variants = Vec::new();
for result in reader.record_bufs(&header) {
let record = result.with_context(|| "Failed to read VCF record")?;
let chrom = record.reference_sequence_name();
if chrom != contig_name {
continue;
}
let Some(pos_1based) = record.variant_start() else {
continue;
};
#[expect(clippy::cast_possible_truncation, reason = "genomic positions fit in u32")]
let position = (usize::from(pos_1based) - 1) as u32;
let ref_allele: Vec<u8> = record.reference_bases().as_bytes().to_vec();
let alt_alleles: Vec<Vec<u8>> = record
.alternate_bases()
.iter()
.filter_map(Result::ok)
.map(|a| a.as_bytes().to_vec())
.collect();
let gt_str = extract_gt_string(&record, sample_index);
let Some(gt_str) = gt_str else {
continue;
};
let genotype = Genotype::parse(>_str)?;
if !genotype.has_alt() {
continue;
}
variants.push(VariantRecord { position, ref_allele, alt_alleles, genotype });
}
variants.sort_by_key(|v| v.position);
Ok(variants)
}
pub(crate) fn validate_vcf_sample(path: &Path, sample_name: Option<&str>) -> Result<String> {
let mut reader = vcf::io::reader::Builder::default()
.build_from_path(path)
.with_context(|| format!("Failed to open VCF: {}", path.display()))?;
let header = reader.read_header()?;
let idx = resolve_sample_index(&header, sample_name)?;
Ok(header
.sample_names()
.get_index(idx)
.expect("resolve_sample_index returned a valid index")
.clone())
}
fn resolve_sample_index(header: &vcf::Header, sample_name: Option<&str>) -> Result<usize> {
let sample_names = header.sample_names();
match sample_name {
Some(name) => {
let idx = sample_names.iter().position(|s| s == name).ok_or_else(|| {
if sample_names.is_empty() {
anyhow::anyhow!("Sample '{name}' not found in VCF (VCF has no sample columns)")
} else {
let available: Vec<&str> =
sample_names.iter().map(String::as_str).take(10).collect();
anyhow::anyhow!(
"Sample '{name}' not found in VCF. Available samples: {}",
available.join(", ")
)
}
})?;
Ok(idx)
}
None => {
if sample_names.len() == 1 {
Ok(0)
} else if sample_names.is_empty() {
bail!("VCF has no sample columns")
} else {
bail!(
"VCF has {} samples but no --sample was specified. \
Available samples: {}",
sample_names.len(),
sample_names.iter().take(10).cloned().collect::<Vec<_>>().join(", ")
)
}
}
}
}
fn extract_gt_string(record: &vcf::variant::RecordBuf, sample_index: usize) -> Option<String> {
use noodles::vcf::variant::record::samples::keys::key;
use noodles::vcf::variant::record_buf::samples::sample::Value;
let samples = record.samples();
let sample = samples.get_index(sample_index)?;
let gt_value = sample.get(key::GENOTYPE)?;
match gt_value {
Some(Value::Genotype(gt)) => Some(format_genotype(gt)),
_ => None,
}
}
fn format_genotype(
gt: &noodles::vcf::variant::record_buf::samples::sample::value::Genotype,
) -> String {
use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
let alleles = gt.as_ref();
let mut parts = Vec::with_capacity(alleles.len());
let mut is_phased = false;
for (i, allele) in alleles.iter().enumerate() {
if i > 0 && allele.phasing() == Phasing::Phased {
is_phased = true;
}
match allele.position() {
Some(idx) => parts.push(idx.to_string()),
None => parts.push(".".to_string()),
}
}
let sep = if is_phased { "|" } else { "/" };
parts.join(sep)
}