1#[derive(Debug, Clone, PartialEq, Eq)]
19pub struct TruthAlignment {
20 pub contig: String,
22 pub position: u32,
24 pub is_forward: bool,
26 pub haplotype: usize,
28 pub fragment_length: u32,
31 pub n_errors: usize,
33}
34
35const SEP: &str = "::";
38
39const PREFIX: &str = "holodeck";
41
42#[must_use]
51pub fn encoded_pe_name(read_num: u64, r1: &TruthAlignment, r2: &TruthAlignment) -> String {
52 debug_assert_eq!(
53 r1.contig, r2.contig,
54 "R1 and R2 must be on the same contig for encoded PE names"
55 );
56 debug_assert_eq!(
57 r1.fragment_length, r2.fragment_length,
58 "R1 and R2 must share the same fragment length"
59 );
60 debug_assert_eq!(r1.haplotype, r2.haplotype, "R1 and R2 must share the same haplotype");
61 assert_contig_is_safe(&r1.contig);
62 format!(
63 "{PREFIX}{SEP}{read_num}{SEP}{frag_len}{SEP}{contig}{SEP}{p1}{s1}{SEP}{p2}{s2}{SEP}{hap}{SEP}{e1}{SEP}{e2}",
64 frag_len = r1.fragment_length,
65 contig = r1.contig,
66 p1 = r1.position,
67 s1 = strand_char(r1.is_forward),
68 p2 = r2.position,
69 s2 = strand_char(r2.is_forward),
70 hap = r1.haplotype,
71 e1 = r1.n_errors,
72 e2 = r2.n_errors,
73 )
74}
75
76#[must_use]
82pub fn encoded_se_name(read_num: u64, r1: &TruthAlignment) -> String {
83 assert_contig_is_safe(&r1.contig);
84 format!(
85 "{PREFIX}{SEP}{read_num}{SEP}{frag_len}{SEP}{contig}{SEP}{pos}{strand}{SEP}{hap}{SEP}{errs}",
86 frag_len = r1.fragment_length,
87 contig = r1.contig,
88 pos = r1.position,
89 strand = strand_char(r1.is_forward),
90 hap = r1.haplotype,
91 errs = r1.n_errors,
92 )
93}
94
95#[must_use]
99pub fn simple_name(read_num: u64) -> String {
100 format!("{PREFIX}{SEP}{read_num}")
101}
102
103fn assert_contig_is_safe(contig: &str) {
107 debug_assert!(
108 !contig.contains('@'),
109 "contig name must not contain '@' (FASTQ header prefix): {contig:?}",
110 );
111 debug_assert!(
112 !contig.contains(SEP),
113 "contig name must not contain '::' (field separator): {contig:?}",
114 );
115}
116
117fn strand_char(is_forward: bool) -> char {
119 if is_forward { 'F' } else { 'R' }
120}
121
122#[must_use]
128pub fn parse_encoded_pe_name(name: &str) -> Option<(u64, TruthAlignment, TruthAlignment)> {
129 let parts: Vec<&str> = name.split(SEP).collect();
130 if parts.len() != 9 || parts[0] != PREFIX {
132 return None;
133 }
134
135 let read_num: u64 = parts[1].parse().ok()?;
136 let fragment_length: u32 = parts[2].parse().ok()?;
137 let contig = parts[3];
138 if contig.is_empty() {
139 return None;
140 }
141 let (pos1, fwd1) = parse_pos_strand(parts[4])?;
142 let (pos2, fwd2) = parse_pos_strand(parts[5])?;
143 let haplotype: usize = parts[6].parse().ok()?;
144 let n_errors_r1: usize = parts[7].parse().ok()?;
145 let n_errors_r2: usize = parts[8].parse().ok()?;
146
147 let r1 = TruthAlignment {
148 contig: contig.to_string(),
149 position: pos1,
150 is_forward: fwd1,
151 haplotype,
152 fragment_length,
153 n_errors: n_errors_r1,
154 };
155 let r2 = TruthAlignment {
156 contig: contig.to_string(),
157 position: pos2,
158 is_forward: fwd2,
159 haplotype,
160 fragment_length,
161 n_errors: n_errors_r2,
162 };
163
164 Some((read_num, r1, r2))
165}
166
167#[must_use]
171pub fn parse_encoded_se_name(name: &str) -> Option<(u64, TruthAlignment)> {
172 let parts: Vec<&str> = name.split(SEP).collect();
173 if parts.len() != 7 || parts[0] != PREFIX {
175 return None;
176 }
177
178 let read_num: u64 = parts[1].parse().ok()?;
179 let fragment_length: u32 = parts[2].parse().ok()?;
180 let contig = parts[3];
181 if contig.is_empty() {
182 return None;
183 }
184 let (pos, fwd) = parse_pos_strand(parts[4])?;
185 let haplotype: usize = parts[5].parse().ok()?;
186 let n_errors: usize = parts[6].parse().ok()?;
187
188 let truth = TruthAlignment {
189 contig: contig.to_string(),
190 position: pos,
191 is_forward: fwd,
192 haplotype,
193 fragment_length,
194 n_errors,
195 };
196
197 Some((read_num, truth))
198}
199
200fn parse_pos_strand(field: &str) -> Option<(u32, bool)> {
202 if field.is_empty() {
203 return None;
204 }
205 let strand_char = field.as_bytes()[field.len() - 1];
206 let is_forward = match strand_char {
207 b'F' => true,
208 b'R' => false,
209 _ => return None,
210 };
211 let pos: u32 = field[..field.len() - 1].parse().ok()?;
212 Some((pos, is_forward))
213}
214
215#[cfg(test)]
216mod tests {
217 use super::*;
218
219 fn make_truth(
220 contig: &str,
221 pos: u32,
222 fwd: bool,
223 hap: usize,
224 frag_len: u32,
225 errs: usize,
226 ) -> TruthAlignment {
227 TruthAlignment {
228 contig: contig.to_string(),
229 position: pos,
230 is_forward: fwd,
231 haplotype: hap,
232 fragment_length: frag_len,
233 n_errors: errs,
234 }
235 }
236
237 #[test]
238 fn test_encoded_pe_name() {
239 let r1 = make_truth("chr1", 10000, true, 0, 600, 2);
240 let r2 = make_truth("chr1", 10450, false, 0, 600, 1);
241 let name = encoded_pe_name(42, &r1, &r2);
242 assert_eq!(name, "holodeck::42::600::chr1::10000F::10450R::0::2::1");
243 }
244
245 #[test]
246 fn test_encoded_se_name() {
247 let r1 = make_truth("chr1", 10000, true, 0, 275, 2);
248 let name = encoded_se_name(42, &r1);
249 assert_eq!(name, "holodeck::42::275::chr1::10000F::0::2");
250 }
251
252 #[test]
253 fn test_simple_name() {
254 assert_eq!(simple_name(42), "holodeck::42");
255 assert_eq!(simple_name(1), "holodeck::1");
256 }
257
258 #[test]
259 fn test_parse_pe_roundtrip() {
260 let r1 = make_truth("chr1", 10000, true, 0, 600, 2);
261 let r2 = make_truth("chr1", 10450, false, 0, 600, 1);
262 let name = encoded_pe_name(42, &r1, &r2);
263
264 let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
265 assert_eq!(num, 42);
266 assert_eq!(parsed_r1, r1);
267 assert_eq!(parsed_r2, r2);
268 }
269
270 #[test]
271 fn test_parse_se_roundtrip() {
272 let r1 = make_truth("chrX", 500, false, 1, 275, 0);
273 let name = encoded_se_name(99, &r1);
274
275 let (num, parsed) = parse_encoded_se_name(&name).unwrap();
276 assert_eq!(num, 99);
277 assert_eq!(parsed, r1);
278 }
279
280 #[test]
281 fn test_parse_pe_with_colon_in_contig() {
282 let r1 = make_truth("HLA-A*01:01:01:01", 100, true, 0, 450, 0);
285 let r2 = make_truth("HLA-A*01:01:01:01", 400, false, 0, 450, 1);
286 let name = encoded_pe_name(1, &r1, &r2);
287
288 let (num, parsed_r1, parsed_r2) = parse_encoded_pe_name(&name).unwrap();
289 assert_eq!(num, 1);
290 assert_eq!(parsed_r1, r1);
291 assert_eq!(parsed_r2, r2);
292 }
293
294 #[test]
295 fn test_parse_se_with_colon_in_contig() {
296 let r1 = make_truth("HLA-B*07:02", 50, false, 1, 120, 3);
297 let name = encoded_se_name(5, &r1);
298
299 let (num, parsed) = parse_encoded_se_name(&name).unwrap();
300 assert_eq!(num, 5);
301 assert_eq!(parsed, r1);
302 }
303
304 #[test]
305 fn test_short_fragment_encodes_adapter_boundary() {
306 let r1 = make_truth("chr1", 10000, true, 0, 40, 0);
308 let r2 = make_truth("chr1", 10000, false, 0, 40, 0);
309 let name = encoded_pe_name(7, &r1, &r2);
310
311 let (_num, parsed_r1, _parsed_r2) = parse_encoded_pe_name(&name).unwrap();
312 assert_eq!(parsed_r1.fragment_length, 40);
313 }
314
315 #[test]
316 fn test_cross_format_rejection() {
317 let se_name = encoded_se_name(1, &make_truth("chr1", 10, true, 0, 150, 0));
319 assert!(parse_encoded_pe_name(&se_name).is_none());
320
321 let pe_name = encoded_pe_name(
322 1,
323 &make_truth("chr1", 10, true, 0, 200, 0),
324 &make_truth("chr1", 20, false, 0, 200, 0),
325 );
326 assert!(parse_encoded_se_name(&pe_name).is_none());
327 }
328
329 #[test]
330 fn test_parse_invalid_names() {
331 assert!(parse_encoded_pe_name("not_holodeck::1::200::chr1::10F::20R::0::0::0").is_none());
333 assert!(parse_encoded_pe_name("").is_none());
335 assert!(parse_encoded_se_name("holodeck::1").is_none());
336 assert!(parse_encoded_pe_name("holodeck:1:200:chr1:10F:20R:0:0:0").is_none());
338 assert!(parse_encoded_se_name("holodeck:1:150:chr1:10F:0:0").is_none());
339 assert!(parse_encoded_se_name("holodeck::1::150::::10F::0::0").is_none());
341 }
342
343 #[test]
344 fn test_parse_pos_strand() {
345 assert_eq!(parse_pos_strand("10000F"), Some((10000, true)));
346 assert_eq!(parse_pos_strand("450R"), Some((450, false)));
347 assert_eq!(parse_pos_strand("0F"), Some((0, true)));
348 assert_eq!(parse_pos_strand("X"), None);
349 assert_eq!(parse_pos_strand(""), None);
350 }
351
352 #[test]
353 #[should_panic(expected = "contig name must not contain '@'")]
354 fn test_formatter_rejects_at_in_contig_se() {
355 let r1 = make_truth("bad@contig", 1, true, 0, 100, 0);
356 let _ = encoded_se_name(1, &r1);
357 }
358
359 #[test]
360 #[should_panic(expected = "contig name must not contain '::'")]
361 fn test_formatter_rejects_double_colon_in_contig_pe() {
362 let r1 = make_truth("bad::contig", 1, true, 0, 100, 0);
363 let r2 = make_truth("bad::contig", 20, false, 0, 100, 0);
364 let _ = encoded_pe_name(1, &r1, &r2);
365 }
366}