Skip to main content

holodeck_lib/
ploidy.rs

1use anyhow::{Result, bail};
2
3/// A ploidy entry specifying the ploidy for a genomic region.
4#[derive(Debug, Clone)]
5struct PloidyEntry {
6    /// Contig name (e.g. "chrX"). None means all contigs.
7    contig: Option<String>,
8    /// Genomic range within the contig. None means the entire contig.
9    /// Coordinates are 0-based half-open `[start, end)`.
10    range: Option<(u32, u32)>,
11    /// Ploidy value.
12    ploidy: u8,
13}
14
15/// Maps genomic positions to ploidy values, supporting a global default with
16/// per-contig and per-region overrides.
17///
18/// Overrides are processed in order and use last-writer-wins semantics: later
19/// entries override earlier ones for overlapping positions. This allows
20/// patterns like:
21///
22/// ```text
23/// --ploidy 2 --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2
24/// ```
25///
26/// which sets diploid genome-wide, then haploid chrX, then diploid PAR1 on
27/// chrX.
28#[derive(Debug, Clone)]
29pub struct PloidyMap {
30    /// Default ploidy for positions not covered by any override.
31    default_ploidy: u8,
32    /// Ordered list of overrides; later entries take precedence.
33    overrides: Vec<PloidyEntry>,
34}
35
36impl PloidyMap {
37    /// Create a new ploidy map with the given default and overrides.
38    ///
39    /// Override strings have the format `CONTIG=PLOIDY` (whole contig) or
40    /// `CONTIG:START-END=PLOIDY` (1-based inclusive coordinates, converted
41    /// internally to 0-based half-open).
42    ///
43    /// # Errors
44    /// Returns an error if any override string is malformed.
45    pub fn new(default_ploidy: u8, override_specs: &[String]) -> Result<Self> {
46        let mut overrides = Vec::with_capacity(override_specs.len());
47
48        for spec in override_specs {
49            let entry = parse_ploidy_override(spec)?;
50            overrides.push(entry);
51        }
52
53        Ok(Self { default_ploidy, overrides })
54    }
55
56    /// Look up the ploidy for a specific contig and 0-based position.
57    ///
58    /// Returns the last matching override, or the default if no override
59    /// applies.
60    #[must_use]
61    pub fn ploidy_at(&self, contig: &str, position: u32) -> u8 {
62        let mut result = self.default_ploidy;
63
64        for entry in &self.overrides {
65            let contig_matches = entry.contig.as_ref().is_none_or(|c| c == contig);
66
67            if !contig_matches {
68                continue;
69            }
70
71            let range_matches = entry
72                .range
73                .as_ref()
74                .is_none_or(|&(start, end)| position >= start && position < end);
75
76            if range_matches {
77                result = entry.ploidy;
78            }
79        }
80
81        result
82    }
83
84    /// Return the default ploidy.
85    #[must_use]
86    pub fn default_ploidy(&self) -> u8 {
87        self.default_ploidy
88    }
89}
90
91/// Parse a single ploidy override specification.
92///
93/// Formats:
94///   - `CONTIG=PLOIDY` — applies to entire contig
95///   - `CONTIG:START-END=PLOIDY` — applies to 1-based inclusive range
96fn parse_ploidy_override(spec: &str) -> Result<PloidyEntry> {
97    let (location, ploidy_str) = spec
98        .rsplit_once('=')
99        .ok_or_else(|| anyhow::anyhow!("Invalid ploidy override format: '{spec}'. Expected CONTIG=PLOIDY or CONTIG:START-END=PLOIDY"))?;
100
101    let ploidy: u8 = ploidy_str
102        .parse()
103        .map_err(|_| anyhow::anyhow!("Invalid ploidy value in override: '{ploidy_str}'"))?;
104
105    if let Some((contig, range_str)) = location.split_once(':') {
106        // CONTIG:START-END format (1-based inclusive input).
107        let (start_str, end_str) = range_str.split_once('-').ok_or_else(|| {
108            anyhow::anyhow!(
109                "Invalid range format in ploidy override: '{range_str}'. Expected START-END"
110            )
111        })?;
112
113        let start_1based: u32 = start_str.parse().map_err(|_| {
114            anyhow::anyhow!("Invalid start position in ploidy override: '{start_str}'")
115        })?;
116        let end_1based: u32 = end_str
117            .parse()
118            .map_err(|_| anyhow::anyhow!("Invalid end position in ploidy override: '{end_str}'"))?;
119
120        if start_1based == 0 {
121            bail!("Ploidy override start position must be >= 1: '{spec}'");
122        }
123        if end_1based < start_1based {
124            bail!("Ploidy override end < start: '{spec}'");
125        }
126
127        // Convert to 0-based half-open.
128        let start_0based = start_1based - 1;
129        let end_0based = end_1based; // 1-based inclusive end == 0-based exclusive end
130
131        Ok(PloidyEntry {
132            contig: Some(contig.to_string()),
133            range: Some((start_0based, end_0based)),
134            ploidy,
135        })
136    } else {
137        // CONTIG=PLOIDY format (whole contig).
138        Ok(PloidyEntry { contig: Some(location.to_string()), range: None, ploidy })
139    }
140}
141
142#[cfg(test)]
143mod tests {
144    use super::*;
145
146    #[test]
147    fn test_default_ploidy() {
148        let map = PloidyMap::new(2, &[]).unwrap();
149        assert_eq!(map.ploidy_at("chr1", 0), 2);
150        assert_eq!(map.ploidy_at("chrX", 1000), 2);
151    }
152
153    #[test]
154    fn test_whole_contig_override() {
155        let map = PloidyMap::new(2, &["chrX=1".to_string()]).unwrap();
156        assert_eq!(map.ploidy_at("chr1", 0), 2);
157        assert_eq!(map.ploidy_at("chrX", 0), 1);
158        assert_eq!(map.ploidy_at("chrX", 999_999), 1);
159    }
160
161    #[test]
162    fn test_range_override() {
163        let overrides = vec![
164            "chrX=1".to_string(),
165            "chrX:10001-2781479=2".to_string(), // PAR1
166        ];
167        let map = PloidyMap::new(2, &overrides).unwrap();
168
169        // Autosome: default 2
170        assert_eq!(map.ploidy_at("chr1", 100), 2);
171        // chrX before PAR1: haploid
172        assert_eq!(map.ploidy_at("chrX", 5000), 1);
173        // chrX in PAR1 (1-based 10001-2781479 → 0-based 10000-2781479): diploid
174        assert_eq!(map.ploidy_at("chrX", 10000), 2);
175        assert_eq!(map.ploidy_at("chrX", 2_781_478), 2);
176        // chrX just past PAR1: haploid again
177        assert_eq!(map.ploidy_at("chrX", 2_781_479), 1);
178    }
179
180    #[test]
181    fn test_last_writer_wins() {
182        // Two overlapping overrides for chrX.
183        let overrides = vec!["chrX=1".to_string(), "chrX=3".to_string()];
184        let map = PloidyMap::new(2, &overrides).unwrap();
185        // Last writer (3) wins.
186        assert_eq!(map.ploidy_at("chrX", 0), 3);
187    }
188
189    #[test]
190    fn test_parse_errors() {
191        // Missing equals sign.
192        assert!(PloidyMap::new(2, &["chrX".to_string()]).is_err());
193        // Non-numeric ploidy.
194        assert!(PloidyMap::new(2, &["chrX=abc".to_string()]).is_err());
195        // Bad range format.
196        assert!(PloidyMap::new(2, &["chrX:100=1".to_string()]).is_err());
197        // Zero start position.
198        assert!(PloidyMap::new(2, &["chrX:0-100=1".to_string()]).is_err());
199        // End < start.
200        assert!(PloidyMap::new(2, &["chrX:200-100=1".to_string()]).is_err());
201    }
202
203    #[test]
204    fn test_multiple_contigs_and_ranges() {
205        let overrides = vec![
206            "chrX=1".to_string(),
207            "chrY=1".to_string(),
208            "chrX:10001-2781479=2".to_string(),
209            "chrX:155701383-156030895=2".to_string(),
210        ];
211        let map = PloidyMap::new(2, &overrides).unwrap();
212
213        assert_eq!(map.ploidy_at("chr1", 100), 2);
214        assert_eq!(map.ploidy_at("chrX", 5000), 1);
215        assert_eq!(map.ploidy_at("chrX", 50000), 2); // PAR1
216        assert_eq!(map.ploidy_at("chrX", 100_000_000), 1); // middle of chrX
217        assert_eq!(map.ploidy_at("chrX", 155_800_000), 2); // PAR2
218        assert_eq!(map.ploidy_at("chrY", 100), 1);
219    }
220}