Skip to main content

holodeck_lib/vcf/
genotype.rs

1use anyhow::{Result, bail};
2
3/// A parsed genotype for a single sample at a variant site.
4///
5/// Alleles are 0-indexed where 0 means reference and 1+ means the Nth
6/// alternate allele.  `None` represents a missing allele (`.` in VCF).
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct Genotype {
9    /// Allele indices for each haplotype.  Length equals ploidy at this site.
10    alleles: Vec<Option<usize>>,
11    /// Whether the genotype is phased (`|` separator) or unphased (`/`).
12    phased: bool,
13}
14
15impl Genotype {
16    /// Parse a VCF GT field string (e.g. `"0/1"`, `"1|0"`, `"0|1|2"`,
17    /// `"./."`, `"0"`).
18    ///
19    /// Returns the parsed genotype with allele indices and phasing information.
20    ///
21    /// # Errors
22    /// Returns an error if the GT string is empty or contains invalid allele
23    /// values.
24    pub fn parse(gt: &str) -> Result<Self> {
25        if gt.is_empty() {
26            bail!("Empty GT field");
27        }
28
29        // Determine separator: phased (|) or unphased (/).
30        // A haploid genotype (e.g. "0") has no separator; treat as phased.
31        let (sep, phased) = if gt.contains('|') {
32            ('|', true)
33        } else if gt.contains('/') {
34            ('/', false)
35        } else {
36            // Haploid: single allele, no separator.
37            let allele = parse_allele(gt)?;
38            return Ok(Self { alleles: vec![allele], phased: true });
39        };
40
41        let alleles: Result<Vec<Option<usize>>> = gt.split(sep).map(parse_allele).collect();
42        let alleles = alleles?;
43
44        if alleles.is_empty() {
45            bail!("GT field has no alleles: {gt}");
46        }
47
48        Ok(Self { alleles, phased })
49    }
50
51    /// Return the allele indices (one per haplotype).
52    #[must_use]
53    pub fn alleles(&self) -> &[Option<usize>] {
54        &self.alleles
55    }
56
57    /// Return whether this genotype is phased.
58    #[must_use]
59    pub fn is_phased(&self) -> bool {
60        self.phased
61    }
62
63    /// Return the ploidy (number of allele slots) at this site.
64    #[must_use]
65    pub fn ploidy(&self) -> usize {
66        self.alleles.len()
67    }
68
69    /// Return `true` if any allele is non-reference (index > 0).
70    #[must_use]
71    pub fn has_alt(&self) -> bool {
72        self.alleles.iter().any(|a| matches!(a, Some(idx) if *idx > 0))
73    }
74
75    /// Return `true` if all alleles are the same non-reference allele.
76    #[must_use]
77    pub fn is_hom_alt(&self) -> bool {
78        if !self.has_alt() {
79            return false;
80        }
81        let first = self.alleles[0];
82        first.is_some() && self.alleles.iter().all(|a| *a == first)
83    }
84
85    /// Return `true` if all alleles are reference (index 0).
86    #[must_use]
87    pub fn is_hom_ref(&self) -> bool {
88        self.alleles.iter().all(|a| *a == Some(0))
89    }
90}
91
92/// Parse a single allele field: `"."` → `None`, digits → `Some(n)`.
93fn parse_allele(s: &str) -> Result<Option<usize>> {
94    if s == "." {
95        Ok(None)
96    } else {
97        let idx: usize = s.parse().map_err(|_| anyhow::anyhow!("Invalid allele value: '{s}'"))?;
98        Ok(Some(idx))
99    }
100}
101
102/// A variant record with its position, alleles, and genotype for the selected
103/// sample.
104#[derive(Debug, Clone)]
105pub struct VariantRecord {
106    /// 0-based reference position.
107    pub position: u32,
108    /// Reference allele bases.
109    pub ref_allele: Vec<u8>,
110    /// Alternate allele sequences, indexed by alt allele number (0-based among
111    /// alternates, so alt allele 1 in VCF is index 0 here).
112    pub alt_alleles: Vec<Vec<u8>>,
113    /// Parsed genotype for the selected sample.
114    pub genotype: Genotype,
115}
116
117impl VariantRecord {
118    /// Return the allele sequence for a given allele index (0 = ref, 1+ = alt).
119    ///
120    /// Returns `None` if the index is out of range.
121    #[must_use]
122    pub fn allele_bases(&self, allele_index: usize) -> Option<&[u8]> {
123        if allele_index == 0 {
124            Some(&self.ref_allele)
125        } else {
126            self.alt_alleles.get(allele_index - 1).map(Vec::as_slice)
127        }
128    }
129}
130
131#[cfg(test)]
132mod tests {
133    use super::*;
134
135    #[test]
136    fn test_parse_diploid_unphased_het() {
137        let gt = Genotype::parse("0/1").unwrap();
138        assert_eq!(gt.alleles(), &[Some(0), Some(1)]);
139        assert!(!gt.is_phased());
140        assert_eq!(gt.ploidy(), 2);
141        assert!(gt.has_alt());
142        assert!(!gt.is_hom_alt());
143        assert!(!gt.is_hom_ref());
144    }
145
146    #[test]
147    fn test_parse_diploid_phased_het() {
148        let gt = Genotype::parse("1|0").unwrap();
149        assert_eq!(gt.alleles(), &[Some(1), Some(0)]);
150        assert!(gt.is_phased());
151        assert!(gt.has_alt());
152    }
153
154    #[test]
155    fn test_parse_diploid_hom_ref() {
156        let gt = Genotype::parse("0/0").unwrap();
157        assert_eq!(gt.alleles(), &[Some(0), Some(0)]);
158        assert!(!gt.has_alt());
159        assert!(gt.is_hom_ref());
160        assert!(!gt.is_hom_alt());
161    }
162
163    #[test]
164    fn test_parse_diploid_hom_alt() {
165        let gt = Genotype::parse("1/1").unwrap();
166        assert_eq!(gt.alleles(), &[Some(1), Some(1)]);
167        assert!(gt.has_alt());
168        assert!(gt.is_hom_alt());
169        assert!(!gt.is_hom_ref());
170    }
171
172    #[test]
173    fn test_parse_triploid() {
174        let gt = Genotype::parse("0|1|2").unwrap();
175        assert_eq!(gt.alleles(), &[Some(0), Some(1), Some(2)]);
176        assert!(gt.is_phased());
177        assert_eq!(gt.ploidy(), 3);
178    }
179
180    #[test]
181    fn test_parse_missing_alleles() {
182        let gt = Genotype::parse("./.").unwrap();
183        assert_eq!(gt.alleles(), &[None, None]);
184        assert!(!gt.has_alt());
185        assert!(!gt.is_hom_ref());
186    }
187
188    #[test]
189    fn test_parse_haploid() {
190        let gt = Genotype::parse("0").unwrap();
191        assert_eq!(gt.alleles(), &[Some(0)]);
192        assert!(gt.is_phased()); // haploid treated as phased
193        assert_eq!(gt.ploidy(), 1);
194
195        let gt = Genotype::parse("1").unwrap();
196        assert!(gt.has_alt());
197    }
198
199    #[test]
200    fn test_parse_empty_errors() {
201        assert!(Genotype::parse("").is_err());
202    }
203
204    #[test]
205    fn test_parse_invalid_allele_errors() {
206        assert!(Genotype::parse("0/abc").is_err());
207    }
208
209    #[test]
210    fn test_variant_record_allele_bases() {
211        let vr = VariantRecord {
212            position: 100,
213            ref_allele: b"A".to_vec(),
214            alt_alleles: vec![b"T".to_vec(), b"GCC".to_vec()],
215            genotype: Genotype::parse("0/1").unwrap(),
216        };
217        assert_eq!(vr.allele_bases(0), Some(b"A".as_slice()));
218        assert_eq!(vr.allele_bases(1), Some(b"T".as_slice()));
219        assert_eq!(vr.allele_bases(2), Some(b"GCC".as_slice()));
220        assert_eq!(vr.allele_bases(3), None);
221    }
222}