Skip to main content

holodeck_lib/
read_naming.rs

1//! Read naming schemes for simulated reads.
2//!
3//! Supports two modes: **encoded** names that embed truth coordinates
4//! (contig, position, strand, haplotype, error count) for downstream
5//! evaluation, and **simple** names that are just sequential identifiers.
6//!
7//! Contig names may contain `:` characters (e.g. HLA contigs). The parser
8//! handles this by splitting from the right where the field count is fixed.
9
10/// Truth alignment data for a single read, used to encode position information
11/// into the read name.
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct TruthAlignment {
14    /// Reference contig name.
15    pub contig: String,
16    /// 1-based reference start position.
17    pub position: u32,
18    /// Whether the read is on the forward strand.
19    pub is_forward: bool,
20    /// 0-based haplotype index.
21    pub haplotype: usize,
22    /// Number of sequencing errors introduced.
23    pub n_errors: usize,
24}
25
26/// Format a read name in encoded mode for a paired-end read.
27///
28/// Format: `holodeck:READ_NUM:CONTIG:POS1+STRAND:POS2+STRAND:HAP:ERRS1:ERRS2`
29///
30/// Example: `holodeck:42:chr1:10000F:10450R:0:2:1`
31///
32/// Both R1 and R2 must be on the same contig; `r2.contig` is not encoded
33/// separately.
34#[must_use]
35pub fn encoded_pe_name(read_num: u64, r1: &TruthAlignment, r2: &TruthAlignment) -> String {
36    debug_assert_eq!(
37        r1.contig, r2.contig,
38        "R1 and R2 must be on the same contig for encoded PE names"
39    );
40    format!(
41        "holodeck:{}:{}:{}{}:{}{}:{}:{}:{}",
42        read_num,
43        r1.contig,
44        r1.position,
45        strand_char(r1.is_forward),
46        r2.position,
47        strand_char(r2.is_forward),
48        r1.haplotype,
49        r1.n_errors,
50        r2.n_errors,
51    )
52}
53
54/// Format a read name in encoded mode for a single-end read.
55///
56/// Format: `holodeck:READ_NUM:CONTIG:POS+STRAND:HAP:ERRS`
57///
58/// Example: `holodeck:42:chr1:10000F:0:2`
59#[must_use]
60pub fn encoded_se_name(read_num: u64, r1: &TruthAlignment) -> String {
61    format!(
62        "holodeck:{}:{}:{}{}:{}:{}",
63        read_num,
64        r1.contig,
65        r1.position,
66        strand_char(r1.is_forward),
67        r1.haplotype,
68        r1.n_errors,
69    )
70}
71
72/// Format a simple read name with no truth information.
73///
74/// Format: `holodeck:READ_NUM`
75#[must_use]
76pub fn simple_name(read_num: u64) -> String {
77    format!("holodeck:{read_num}")
78}
79
80/// Return the strand character for a boolean forward flag.
81fn strand_char(is_forward: bool) -> char {
82    if is_forward { 'F' } else { 'R' }
83}
84
85/// Parse an encoded paired-end read name back into truth alignments.
86///
87/// Parses from the right to handle contig names that contain `:` characters
88/// (e.g. HLA contigs like `HLA-A*01:01:01:01`).
89///
90/// Returns `(read_num, r1_truth, r2_truth)` or `None` if the name doesn't
91/// match the expected format.
92#[must_use]
93pub fn parse_encoded_pe_name(name: &str) -> Option<(u64, TruthAlignment, TruthAlignment)> {
94    // Split from the right: the last 5 fields are always
95    // pos2+strand, hap, errs_r1, errs_r2, and pos1+strand.
96    // The prefix is "holodeck:read_num:contig" (contig may contain colons).
97    let mut rev_parts: Vec<&str> = name.rsplitn(6, ':').collect();
98    if rev_parts.len() != 6 {
99        return None;
100    }
101    // rsplitn gives parts in reverse order:
102    // [n_errors_r2, n_errors_r1, hap, pos2+strand, pos1+strand, "holodeck:read_num:contig"]
103    rev_parts.reverse();
104    // Now: ["holodeck:read_num:contig", pos1+strand, pos2+strand, hap, n_errors_r1, n_errors_r2]
105
106    let prefix = rev_parts[0];
107    let (pos1, fwd1) = parse_pos_strand(rev_parts[1])?;
108    let (pos2, fwd2) = parse_pos_strand(rev_parts[2])?;
109    let haplotype: usize = rev_parts[3].parse().ok()?;
110    let n_errors_r1: usize = rev_parts[4].parse().ok()?;
111    let n_errors_r2: usize = rev_parts[5].parse().ok()?;
112
113    // Parse prefix: "holodeck:read_num:contig_with_possible_colons"
114    let prefix = prefix.strip_prefix("holodeck:")?;
115    let (read_num_str, contig) = prefix.split_once(':')?;
116    let read_num: u64 = read_num_str.parse().ok()?;
117
118    let r1 = TruthAlignment {
119        contig: contig.to_string(),
120        position: pos1,
121        is_forward: fwd1,
122        haplotype,
123        n_errors: n_errors_r1,
124    };
125    let r2 = TruthAlignment {
126        contig: contig.to_string(),
127        position: pos2,
128        is_forward: fwd2,
129        haplotype,
130        n_errors: n_errors_r2,
131    };
132
133    Some((read_num, r1, r2))
134}
135
136/// Parse an encoded single-end read name back into a truth alignment.
137///
138/// Parses from the right to handle contig names that contain `:` characters.
139///
140/// Returns `(read_num, truth)` or `None` if the name doesn't match.
141#[must_use]
142pub fn parse_encoded_se_name(name: &str) -> Option<(u64, TruthAlignment)> {
143    // Last 3 fields from the right: pos+strand, hap, n_errors
144    // Prefix: "holodeck:read_num:contig"
145    let mut rev_parts: Vec<&str> = name.rsplitn(4, ':').collect();
146    if rev_parts.len() != 4 {
147        return None;
148    }
149    rev_parts.reverse();
150    // ["holodeck:read_num:contig", pos+strand, hap, n_errors]
151
152    let prefix = rev_parts[0];
153    let (pos, fwd) = parse_pos_strand(rev_parts[1])?;
154    let haplotype: usize = rev_parts[2].parse().ok()?;
155    let n_errors: usize = rev_parts[3].parse().ok()?;
156
157    let prefix = prefix.strip_prefix("holodeck:")?;
158    let (read_num_str, contig) = prefix.split_once(':')?;
159    let read_num: u64 = read_num_str.parse().ok()?;
160
161    let truth = TruthAlignment {
162        contig: contig.to_string(),
163        position: pos,
164        is_forward: fwd,
165        haplotype,
166        n_errors,
167    };
168
169    Some((read_num, truth))
170}
171
172/// Parse a position+strand field like "10000F" or "450R".
173fn parse_pos_strand(field: &str) -> Option<(u32, bool)> {
174    if field.is_empty() {
175        return None;
176    }
177    let strand_char = field.as_bytes()[field.len() - 1];
178    let is_forward = match strand_char {
179        b'F' => true,
180        b'R' => false,
181        _ => return None,
182    };
183    let pos: u32 = field[..field.len() - 1].parse().ok()?;
184    Some((pos, is_forward))
185}
186
187#[cfg(test)]
188mod tests {
189    use super::*;
190
191    fn make_truth(contig: &str, pos: u32, fwd: bool, hap: usize, errs: usize) -> TruthAlignment {
192        TruthAlignment {
193            contig: contig.to_string(),
194            position: pos,
195            is_forward: fwd,
196            haplotype: hap,
197            n_errors: errs,
198        }
199    }
200
201    #[test]
202    fn test_encoded_pe_name() {
203        let r1 = make_truth("chr1", 10000, true, 0, 2);
204        let r2 = make_truth("chr1", 10450, false, 0, 1);
205        let name = encoded_pe_name(42, &r1, &r2);
206        assert_eq!(name, "holodeck:42:chr1:10000F:10450R:0:2:1");
207    }
208
209    #[test]
210    fn test_encoded_se_name() {
211        let r1 = make_truth("chr1", 10000, true, 0, 2);
212        let name = encoded_se_name(42, &r1);
213        assert_eq!(name, "holodeck:42:chr1:10000F:0:2");
214    }
215
216    #[test]
217    fn test_simple_name() {
218        assert_eq!(simple_name(42), "holodeck:42");
219        assert_eq!(simple_name(1), "holodeck:1");
220    }
221
222    #[test]
223    fn test_parse_pe_roundtrip() {
224        let r1 = make_truth("chr1", 10000, true, 0, 2);
225        let r2 = make_truth("chr1", 10450, false, 0, 1);
226        let name = encoded_pe_name(42, &r1, &r2);
227
228        let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
229        assert_eq!(num, 42);
230        assert_eq!(parsed_r1, r1);
231        assert_eq!(parsed_r2, r2);
232    }
233
234    #[test]
235    fn test_parse_se_roundtrip() {
236        let r1 = make_truth("chrX", 500, false, 1, 0);
237        let name = encoded_se_name(99, &r1);
238
239        let (num, parsed) = parse_encoded_se_name(&name).unwrap();
240        assert_eq!(num, 99);
241        assert_eq!(parsed, r1);
242    }
243
244    #[test]
245    fn test_parse_pe_with_colon_in_contig() {
246        // HLA contig names contain colons.
247        let r1 = make_truth("HLA-A*01:01:01:01", 100, true, 0, 0);
248        let r2 = make_truth("HLA-A*01:01:01:01", 400, false, 0, 1);
249        let name = encoded_pe_name(1, &r1, &r2);
250
251        let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
252        assert_eq!(num, 1);
253        assert_eq!(parsed_r1.contig, "HLA-A*01:01:01:01");
254        assert_eq!(parsed_r2.contig, "HLA-A*01:01:01:01");
255        assert_eq!(parsed_r1.position, 100);
256        assert_eq!(parsed_r2.position, 400);
257    }
258
259    #[test]
260    fn test_parse_se_with_colon_in_contig() {
261        let r1 = make_truth("HLA-B*07:02", 50, false, 1, 3);
262        let name = encoded_se_name(5, &r1);
263
264        let (num, parsed) = parse_encoded_se_name(&name).unwrap();
265        assert_eq!(num, 5);
266        assert_eq!(parsed, r1);
267    }
268
269    #[test]
270    fn test_cross_format_rejection() {
271        // SE name should not parse as PE.
272        let se_name = encoded_se_name(1, &make_truth("chr1", 10, true, 0, 0));
273        assert!(parse_encoded_pe_name(&se_name).is_none());
274
275        // PE name should not parse as SE (wrong field structure from the right).
276        let pe_name = encoded_pe_name(
277            1,
278            &make_truth("chr1", 10, true, 0, 0),
279            &make_truth("chr1", 20, false, 0, 0),
280        );
281        // SE parser expects pos+strand as the 3rd-from-right field, but for PE
282        // that field is the haplotype index (a plain number, no F/R suffix).
283        assert!(parse_encoded_se_name(&pe_name).is_none());
284    }
285
286    #[test]
287    fn test_parse_invalid_names() {
288        assert!(parse_encoded_pe_name("not_holodeck:1:chr1:10F:20R:0:0:0").is_none());
289        assert!(parse_encoded_pe_name("").is_none());
290        assert!(parse_encoded_se_name("holodeck:1").is_none());
291    }
292
293    #[test]
294    fn test_parse_pos_strand() {
295        assert_eq!(parse_pos_strand("10000F"), Some((10000, true)));
296        assert_eq!(parse_pos_strand("450R"), Some((450, false)));
297        assert_eq!(parse_pos_strand("0F"), Some((0, true)));
298        assert_eq!(parse_pos_strand("X"), None);
299        assert_eq!(parse_pos_strand(""), None);
300    }
301}