Skip to main content

holodeck_lib/vcf/
mod.rs

1pub mod genotype;
2
3use std::path::Path;
4
5use anyhow::{Context, Result, bail};
6use noodles::vcf;
7use noodles::vcf::variant::record::AlternateBases;
8
9use crate::sequence_dict::SequenceDictionary;
10use genotype::{Genotype, VariantRecord};
11
12/// Load variant records from a VCF for a given sample and contig.
13///
14/// Reads all records for `contig_name`, parses the GT field for the selected
15/// sample, and returns a sorted list of [`VariantRecord`]s.  Records where the
16/// sample's genotype is homozygous reference or entirely missing are skipped.
17///
18/// # Arguments
19/// * `path` — Path to the VCF file.
20/// * `contig_name` — Name of the contig to load variants for.
21/// * `sample_name` — Sample name, or `None` to use the only sample in the VCF.
22/// * `_dict` — Sequence dictionary (reserved for future validation).
23///
24/// # Errors
25/// Returns an error if the VCF cannot be read, the sample is not found, or GT
26/// fields are missing or malformed.
27pub fn load_variants_for_contig(
28    path: &Path,
29    contig_name: &str,
30    sample_name: Option<&str>,
31    _dict: &SequenceDictionary,
32) -> Result<Vec<VariantRecord>> {
33    let mut reader = vcf::io::reader::Builder::default()
34        .build_from_path(path)
35        .with_context(|| format!("Failed to open VCF: {}", path.display()))?;
36
37    let header = reader.read_header()?;
38    let sample_index = resolve_sample_index(&header, sample_name)?;
39
40    let mut variants = Vec::new();
41
42    // TODO: Use tabix-indexed random access to read only the target contig.
43    // Currently reads every record and filters by contig name, which is
44    // O(total_variants) per contig call.
45    for result in reader.record_bufs(&header) {
46        let record = result.with_context(|| "Failed to read VCF record")?;
47
48        // Filter to the target contig.
49        let chrom = record.reference_sequence_name();
50        if chrom != contig_name {
51            continue;
52        }
53
54        // Parse position (VCF is 1-based, convert to 0-based).
55        let Some(pos_1based) = record.variant_start() else {
56            continue;
57        };
58        #[expect(clippy::cast_possible_truncation, reason = "genomic positions fit in u32")]
59        let position = (usize::from(pos_1based) - 1) as u32;
60
61        // Extract alleles.
62        let ref_allele: Vec<u8> = record.reference_bases().as_bytes().to_vec();
63        let alt_alleles: Vec<Vec<u8>> = record
64            .alternate_bases()
65            .iter()
66            .filter_map(Result::ok)
67            .map(|a| a.as_bytes().to_vec())
68            .collect();
69
70        // Parse genotype for the selected sample.
71        let gt_str = extract_gt_string(&record, sample_index);
72        let Some(gt_str) = gt_str else {
73            continue;
74        };
75
76        let genotype = Genotype::parse(&gt_str)?;
77
78        // Skip genotypes with no alt alleles (hom-ref, all-missing, etc.).
79        if !genotype.has_alt() {
80            continue;
81        }
82
83        variants.push(VariantRecord { position, ref_allele, alt_alleles, genotype });
84    }
85
86    // Sort by position (should already be sorted in a valid VCF, but ensure).
87    variants.sort_by_key(|v| v.position);
88
89    Ok(variants)
90}
91
92/// Validate that a VCF file has the expected sample configuration and
93/// return the resolved sample name.
94///
95/// Opens the VCF, reads the header, and checks:
96/// - If `sample_name` is `Some`, that the named sample exists.
97/// - If `sample_name` is `None`, that the VCF has exactly one sample.
98///
99/// Call this during argument validation to surface sample errors before
100/// the per-contig simulation loop begins.
101///
102/// # Errors
103/// Returns an error if the VCF cannot be read or the sample configuration
104/// is invalid.
105pub(crate) fn validate_vcf_sample(path: &Path, sample_name: Option<&str>) -> Result<String> {
106    let mut reader = vcf::io::reader::Builder::default()
107        .build_from_path(path)
108        .with_context(|| format!("Failed to open VCF: {}", path.display()))?;
109    let header = reader.read_header()?;
110    let idx = resolve_sample_index(&header, sample_name)?;
111    Ok(header
112        .sample_names()
113        .get_index(idx)
114        .expect("resolve_sample_index returned a valid index")
115        .clone())
116}
117
118/// Resolve the sample index from a VCF header.
119///
120/// If `sample_name` is `Some`, looks up that sample. If `None` and the VCF has
121/// exactly one sample, uses index 0. Errors if the VCF has multiple samples
122/// and no name was specified.
123fn resolve_sample_index(header: &vcf::Header, sample_name: Option<&str>) -> Result<usize> {
124    let sample_names = header.sample_names();
125
126    match sample_name {
127        Some(name) => {
128            let idx = sample_names.iter().position(|s| s == name).ok_or_else(|| {
129                if sample_names.is_empty() {
130                    anyhow::anyhow!("Sample '{name}' not found in VCF (VCF has no sample columns)")
131                } else {
132                    let available: Vec<&str> =
133                        sample_names.iter().map(String::as_str).take(10).collect();
134                    anyhow::anyhow!(
135                        "Sample '{name}' not found in VCF. Available samples: {}",
136                        available.join(", ")
137                    )
138                }
139            })?;
140            Ok(idx)
141        }
142        None => {
143            if sample_names.len() == 1 {
144                Ok(0)
145            } else if sample_names.is_empty() {
146                bail!("VCF has no sample columns")
147            } else {
148                bail!(
149                    "VCF has {} samples but no --sample was specified. \
150                     Available samples: {}",
151                    sample_names.len(),
152                    sample_names.iter().take(10).cloned().collect::<Vec<_>>().join(", ")
153                )
154            }
155        }
156    }
157}
158
159/// Extract the GT field as a string for a specific sample from a VCF record.
160///
161/// Returns `None` if the GT field is absent for this sample.
162fn extract_gt_string(record: &vcf::variant::RecordBuf, sample_index: usize) -> Option<String> {
163    use noodles::vcf::variant::record::samples::keys::key;
164    use noodles::vcf::variant::record_buf::samples::sample::Value;
165
166    let samples = record.samples();
167    let sample = samples.get_index(sample_index)?;
168    let gt_value = sample.get(key::GENOTYPE)?;
169
170    match gt_value {
171        Some(Value::Genotype(gt)) => Some(format_genotype(gt)),
172        _ => None,
173    }
174}
175
176/// Format a noodles `Genotype` value back to a VCF GT string (e.g. "0/1" or
177/// "1|0").
178fn format_genotype(
179    gt: &noodles::vcf::variant::record_buf::samples::sample::value::Genotype,
180) -> String {
181    use noodles::vcf::variant::record::samples::series::value::genotype::Phasing;
182
183    let alleles = gt.as_ref();
184    let mut parts = Vec::with_capacity(alleles.len());
185    let mut is_phased = false;
186
187    for (i, allele) in alleles.iter().enumerate() {
188        if i > 0 && allele.phasing() == Phasing::Phased {
189            is_phased = true;
190        }
191        match allele.position() {
192            Some(idx) => parts.push(idx.to_string()),
193            None => parts.push(".".to_string()),
194        }
195    }
196
197    let sep = if is_phased { "|" } else { "/" };
198    parts.join(sep)
199}