Skip to main content

holodeck_lib/
read_naming.rs

1//! Read naming schemes for simulated reads.
2//!
3//! Supports two modes: **encoded** names that embed truth coordinates
4//! (contig, position, strand, haplotype, fragment length, error count) for
5//! downstream evaluation, and **simple** names that are just sequential
6//! identifiers.
7//!
8//! Fields are separated by `::` (double colon). Contig names may legally
9//! contain single `:` characters (e.g. HLA contigs like `HLA-A*01:01:01:01`);
10//! using `::` as the separator keeps parsing unambiguous without requiring
11//! right-to-left tricks.
12
13/// Truth alignment data for a single read, used to encode position
14/// information into the read name.
15///
16/// In paired-end mode both reads of a pair carry the same `fragment_length`
17/// and `haplotype`.
18#[derive(Debug, Clone, PartialEq, Eq)]
19pub struct TruthAlignment {
20    /// Reference contig name.
21    pub contig: String,
22    /// 1-based reference start position.
23    pub position: u32,
24    /// Whether the read is on the forward strand.
25    pub is_forward: bool,
26    /// 0-based haplotype index.
27    pub haplotype: usize,
28    /// Length in bases of the source fragment. When shorter than the read
29    /// length, the remainder of the read is adapter (optionally N-padded).
30    pub fragment_length: u32,
31    /// Number of sequencing errors introduced.
32    pub n_errors: usize,
33}
34
35/// Field separator for encoded read names. Chosen so that contig names
36/// containing single `:` characters remain unambiguous.
37const SEP: &str = "::";
38
39/// Prefix that identifies an encoded holodeck read name.
40const PREFIX: &str = "holodeck";
41
42/// Format a read name in encoded mode for a paired-end read.
43///
44/// Format: `holodeck::READ_NUM::FRAG_LEN::CONTIG::POS1+STRAND::POS2+STRAND::HAP::ERRS1::ERRS2`
45///
46/// Example: `holodeck::42::600::chr1::10000F::10450R::0::2::1`
47///
48/// Both R1 and R2 must share the same `contig`, `haplotype`, and
49/// `fragment_length` — these fields are encoded once from `r1` only.
50#[must_use]
51pub fn encoded_pe_name(read_num: u64, r1: &TruthAlignment, r2: &TruthAlignment) -> String {
52    debug_assert_eq!(
53        r1.contig, r2.contig,
54        "R1 and R2 must be on the same contig for encoded PE names"
55    );
56    debug_assert_eq!(
57        r1.fragment_length, r2.fragment_length,
58        "R1 and R2 must share the same fragment length"
59    );
60    debug_assert_eq!(r1.haplotype, r2.haplotype, "R1 and R2 must share the same haplotype");
61    assert_contig_is_safe(&r1.contig);
62    format!(
63        "{PREFIX}{SEP}{read_num}{SEP}{frag_len}{SEP}{contig}{SEP}{p1}{s1}{SEP}{p2}{s2}{SEP}{hap}{SEP}{e1}{SEP}{e2}",
64        frag_len = r1.fragment_length,
65        contig = r1.contig,
66        p1 = r1.position,
67        s1 = strand_char(r1.is_forward),
68        p2 = r2.position,
69        s2 = strand_char(r2.is_forward),
70        hap = r1.haplotype,
71        e1 = r1.n_errors,
72        e2 = r2.n_errors,
73    )
74}
75
76/// Format a read name in encoded mode for a single-end read.
77///
78/// Format: `holodeck::READ_NUM::FRAG_LEN::CONTIG::POS+STRAND::HAP::ERRS`
79///
80/// Example: `holodeck::42::275::chr1::10000F::0::2`
81#[must_use]
82pub fn encoded_se_name(read_num: u64, r1: &TruthAlignment) -> String {
83    assert_contig_is_safe(&r1.contig);
84    format!(
85        "{PREFIX}{SEP}{read_num}{SEP}{frag_len}{SEP}{contig}{SEP}{pos}{strand}{SEP}{hap}{SEP}{errs}",
86        frag_len = r1.fragment_length,
87        contig = r1.contig,
88        pos = r1.position,
89        strand = strand_char(r1.is_forward),
90        hap = r1.haplotype,
91        errs = r1.n_errors,
92    )
93}
94
95/// Format a simple read name with no truth information.
96///
97/// Format: `holodeck::READ_NUM`
98#[must_use]
99pub fn simple_name(read_num: u64) -> String {
100    format!("{PREFIX}{SEP}{read_num}")
101}
102
103/// Panic in debug builds if `contig` contains characters that would corrupt
104/// the encoded read name: `@` (FASTQ header prefix) or `::` (field
105/// separator).
106fn assert_contig_is_safe(contig: &str) {
107    debug_assert!(
108        !contig.contains('@'),
109        "contig name must not contain '@' (FASTQ header prefix): {contig:?}",
110    );
111    debug_assert!(
112        !contig.contains(SEP),
113        "contig name must not contain '::' (field separator): {contig:?}",
114    );
115}
116
117/// Return the strand character for a boolean forward flag.
118fn strand_char(is_forward: bool) -> char {
119    if is_forward { 'F' } else { 'R' }
120}
121
122/// Parse an encoded paired-end read name back into truth alignments.
123///
124/// Returns `(read_num, r1_truth, r2_truth)` or `None` if the name doesn't
125/// match the expected format. The returned alignments share `contig`,
126/// `haplotype`, and `fragment_length`.
127#[must_use]
128pub fn parse_encoded_pe_name(name: &str) -> Option<(u64, TruthAlignment, TruthAlignment)> {
129    let parts: Vec<&str> = name.split(SEP).collect();
130    // holodeck, read_num, frag_len, contig, pos1+strand, pos2+strand, hap, errs1, errs2.
131    if parts.len() != 9 || parts[0] != PREFIX {
132        return None;
133    }
134
135    let read_num: u64 = parts[1].parse().ok()?;
136    let fragment_length: u32 = parts[2].parse().ok()?;
137    let contig = parts[3];
138    if contig.is_empty() {
139        return None;
140    }
141    let (pos1, fwd1) = parse_pos_strand(parts[4])?;
142    let (pos2, fwd2) = parse_pos_strand(parts[5])?;
143    let haplotype: usize = parts[6].parse().ok()?;
144    let n_errors_r1: usize = parts[7].parse().ok()?;
145    let n_errors_r2: usize = parts[8].parse().ok()?;
146
147    let r1 = TruthAlignment {
148        contig: contig.to_string(),
149        position: pos1,
150        is_forward: fwd1,
151        haplotype,
152        fragment_length,
153        n_errors: n_errors_r1,
154    };
155    let r2 = TruthAlignment {
156        contig: contig.to_string(),
157        position: pos2,
158        is_forward: fwd2,
159        haplotype,
160        fragment_length,
161        n_errors: n_errors_r2,
162    };
163
164    Some((read_num, r1, r2))
165}
166
167/// Parse an encoded single-end read name back into a truth alignment.
168///
169/// Returns `(read_num, truth)` or `None` if the name doesn't match.
170#[must_use]
171pub fn parse_encoded_se_name(name: &str) -> Option<(u64, TruthAlignment)> {
172    let parts: Vec<&str> = name.split(SEP).collect();
173    // holodeck, read_num, frag_len, contig, pos+strand, hap, errs.
174    if parts.len() != 7 || parts[0] != PREFIX {
175        return None;
176    }
177
178    let read_num: u64 = parts[1].parse().ok()?;
179    let fragment_length: u32 = parts[2].parse().ok()?;
180    let contig = parts[3];
181    if contig.is_empty() {
182        return None;
183    }
184    let (pos, fwd) = parse_pos_strand(parts[4])?;
185    let haplotype: usize = parts[5].parse().ok()?;
186    let n_errors: usize = parts[6].parse().ok()?;
187
188    let truth = TruthAlignment {
189        contig: contig.to_string(),
190        position: pos,
191        is_forward: fwd,
192        haplotype,
193        fragment_length,
194        n_errors,
195    };
196
197    Some((read_num, truth))
198}
199
200/// Parse a position+strand field like "10000F" or "450R".
201fn parse_pos_strand(field: &str) -> Option<(u32, bool)> {
202    if field.is_empty() {
203        return None;
204    }
205    let strand_char = field.as_bytes()[field.len() - 1];
206    let is_forward = match strand_char {
207        b'F' => true,
208        b'R' => false,
209        _ => return None,
210    };
211    let pos: u32 = field[..field.len() - 1].parse().ok()?;
212    Some((pos, is_forward))
213}
214
215#[cfg(test)]
216mod tests {
217    use super::*;
218
219    fn make_truth(
220        contig: &str,
221        pos: u32,
222        fwd: bool,
223        hap: usize,
224        frag_len: u32,
225        errs: usize,
226    ) -> TruthAlignment {
227        TruthAlignment {
228            contig: contig.to_string(),
229            position: pos,
230            is_forward: fwd,
231            haplotype: hap,
232            fragment_length: frag_len,
233            n_errors: errs,
234        }
235    }
236
237    #[test]
238    fn test_encoded_pe_name() {
239        let r1 = make_truth("chr1", 10000, true, 0, 600, 2);
240        let r2 = make_truth("chr1", 10450, false, 0, 600, 1);
241        let name = encoded_pe_name(42, &r1, &r2);
242        assert_eq!(name, "holodeck::42::600::chr1::10000F::10450R::0::2::1");
243    }
244
245    #[test]
246    fn test_encoded_se_name() {
247        let r1 = make_truth("chr1", 10000, true, 0, 275, 2);
248        let name = encoded_se_name(42, &r1);
249        assert_eq!(name, "holodeck::42::275::chr1::10000F::0::2");
250    }
251
252    #[test]
253    fn test_simple_name() {
254        assert_eq!(simple_name(42), "holodeck::42");
255        assert_eq!(simple_name(1), "holodeck::1");
256    }
257
258    #[test]
259    fn test_parse_pe_roundtrip() {
260        let r1 = make_truth("chr1", 10000, true, 0, 600, 2);
261        let r2 = make_truth("chr1", 10450, false, 0, 600, 1);
262        let name = encoded_pe_name(42, &r1, &r2);
263
264        let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
265        assert_eq!(num, 42);
266        assert_eq!(parsed_r1, r1);
267        assert_eq!(parsed_r2, r2);
268    }
269
270    #[test]
271    fn test_parse_se_roundtrip() {
272        let r1 = make_truth("chrX", 500, false, 1, 275, 0);
273        let name = encoded_se_name(99, &r1);
274
275        let (num, parsed) = parse_encoded_se_name(&name).unwrap();
276        assert_eq!(num, 99);
277        assert_eq!(parsed, r1);
278    }
279
280    #[test]
281    fn test_parse_pe_with_colon_in_contig() {
282        // HLA contig names contain single colons, which is the whole point of
283        // using `::` as the separator.
284        let r1 = make_truth("HLA-A*01:01:01:01", 100, true, 0, 450, 0);
285        let r2 = make_truth("HLA-A*01:01:01:01", 400, false, 0, 450, 1);
286        let name = encoded_pe_name(1, &r1, &r2);
287
288        let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
289        assert_eq!(num, 1);
290        assert_eq!(parsed_r1, r1);
291        assert_eq!(parsed_r2, r2);
292    }
293
294    #[test]
295    fn test_parse_se_with_colon_in_contig() {
296        let r1 = make_truth("HLA-B*07:02", 50, false, 1, 120, 3);
297        let name = encoded_se_name(5, &r1);
298
299        let (num, parsed) = parse_encoded_se_name(&name).unwrap();
300        assert_eq!(num, 5);
301        assert_eq!(parsed, r1);
302    }
303
304    #[test]
305    fn test_short_fragment_encodes_adapter_boundary() {
306        // A 40-base fragment read at 150bp read length: bases[40..] are adapter.
307        let r1 = make_truth("chr1", 10000, true, 0, 40, 0);
308        let r2 = make_truth("chr1", 10000, false, 0, 40, 0);
309        let name = encoded_pe_name(7, &r1, &r2);
310
311        let (_num, parsed_r1, _parsed_r2) = parse_encoded_pe_name(&name).unwrap();
312        assert_eq!(parsed_r1.fragment_length, 40);
313    }
314
315    #[test]
316    fn test_cross_format_rejection() {
317        // SE name should not parse as PE and vice versa — field counts differ.
318        let se_name = encoded_se_name(1, &make_truth("chr1", 10, true, 0, 150, 0));
319        assert!(parse_encoded_pe_name(&se_name).is_none());
320
321        let pe_name = encoded_pe_name(
322            1,
323            &make_truth("chr1", 10, true, 0, 200, 0),
324            &make_truth("chr1", 20, false, 0, 200, 0),
325        );
326        assert!(parse_encoded_se_name(&pe_name).is_none());
327    }
328
329    #[test]
330    fn test_parse_invalid_names() {
331        // Wrong prefix.
332        assert!(parse_encoded_pe_name("not_holodeck::1::200::chr1::10F::20R::0::0::0").is_none());
333        // Empty.
334        assert!(parse_encoded_pe_name("").is_none());
335        assert!(parse_encoded_se_name("holodeck::1").is_none());
336        // Pre-existing single-colon format must not parse under the new scheme.
337        assert!(parse_encoded_pe_name("holodeck:1:200:chr1:10F:20R:0:0:0").is_none());
338        assert!(parse_encoded_se_name("holodeck:1:150:chr1:10F:0:0").is_none());
339        // Empty contig.
340        assert!(parse_encoded_se_name("holodeck::1::150::::10F::0::0").is_none());
341    }
342
343    #[test]
344    fn test_parse_pos_strand() {
345        assert_eq!(parse_pos_strand("10000F"), Some((10000, true)));
346        assert_eq!(parse_pos_strand("450R"), Some((450, false)));
347        assert_eq!(parse_pos_strand("0F"), Some((0, true)));
348        assert_eq!(parse_pos_strand("X"), None);
349        assert_eq!(parse_pos_strand(""), None);
350    }
351
352    #[test]
353    #[should_panic(expected = "contig name must not contain '@'")]
354    fn test_formatter_rejects_at_in_contig_se() {
355        let r1 = make_truth("bad@contig", 1, true, 0, 100, 0);
356        let _ = encoded_se_name(1, &r1);
357    }
358
359    #[test]
360    #[should_panic(expected = "contig name must not contain '::'")]
361    fn test_formatter_rejects_double_colon_in_contig_pe() {
362        let r1 = make_truth("bad::contig", 1, true, 0, 100, 0);
363        let r2 = make_truth("bad::contig", 20, false, 0, 100, 0);
364        let _ = encoded_pe_name(1, &r1, &r2);
365    }
366}