1use gtf_splice_index::types::RefBlock;
9use rust_htslib::bam::Record;
10use rust_htslib::bam::record::{Cigar, CigarStringView};
11
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
14pub enum Strand {
15 Plus,
16 Minus,
17 Unknown,
18}
19
20#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum ReadOpKind {
23 Match,
24 Equal,
25 Diff,
26 Ins,
27 Del,
28 RefSkip,
29 SoftClip,
30 HardClip,
31 Pad,
32}
33
34#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub struct ReadOp {
37 pub kind: ReadOpKind,
38 pub len: u32,
39 pub ref_start0: u32,
40 pub read_start: u32,
41}
42
43#[derive(Debug, Clone, Copy, PartialEq, Eq)]
45pub struct ObservedBase {
46 pub base: u8,
47 pub qual: Option<u8>,
48 pub read_pos: u32,
49}
50
51#[derive(Debug, Clone, PartialEq, Eq)]
53pub struct AlignedRead {
54 pub chr_id: usize,
55 pub strand: Strand,
56 pub ref_start0: u32,
57 pub seq: Vec<u8>,
58 pub qual: Option<Vec<u8>>,
59 pub ops: Vec<ReadOp>,
60 finalized: bool,
61}
62
63impl ReadOpKind {
64 pub fn consumes_ref(&self) -> bool {
66 matches!(
67 self,
68 Self::Match | Self::Equal | Self::Diff | Self::Del | Self::RefSkip
69 )
70 }
71
72 pub fn consumes_read(&self) -> bool {
74 matches!(
75 self,
76 Self::Match | Self::Equal | Self::Diff | Self::Ins | Self::SoftClip
77 )
78 }
79
80 pub fn aligned_bases(&self) -> bool {
82 matches!(self, Self::Match | Self::Equal | Self::Diff)
83 }
84}
85
86impl ReadOp {
87 pub fn new(kind: ReadOpKind, len: u32, ref_start0: u32, read_start: u32) -> Self {
89 Self {
90 kind,
91 len,
92 ref_start0,
93 read_start,
94 }
95 }
96
97 pub fn ref_end0(&self) -> u32 {
99 if self.kind.consumes_ref() {
100 self.ref_start0 + self.len
101 } else {
102 self.ref_start0
103 }
104 }
105
106 pub fn read_end(&self) -> u32 {
108 if self.kind.consumes_read() {
109 self.read_start + self.len
110 } else {
111 self.read_start
112 }
113 }
114
115 pub fn can_observe_reference_base(&self) -> bool {
117 self.kind.aligned_bases()
118 }
119
120 pub fn contains_ref_pos(&self, pos0: u32) -> bool {
122 self.kind.consumes_ref() && self.ref_start0 <= pos0 && pos0 < self.ref_end0()
123 }
124
125 pub fn read_pos_for_ref_pos(&self, pos0: u32) -> Option<u32> {
127 if !self.can_observe_reference_base() || !self.contains_ref_pos(pos0) {
128 return None;
129 }
130
131 Some(self.read_start + (pos0 - self.ref_start0))
132 }
133}
134
135impl ObservedBase {
136 pub fn new(base: u8, qual: Option<u8>, read_pos: u32) -> Self {
138 Self {
139 base: base.to_ascii_uppercase(),
140 qual,
141 read_pos,
142 }
143 }
144}
145
146impl AlignedRead {
147 pub fn new(
152 chr_id: usize,
153 strand: Strand,
154 ref_start0: u32,
155 seq: Vec<u8>,
156 qual: Option<Vec<u8>>,
157 ops_input: Vec<(ReadOpKind, u32)>,
158 ) -> Self {
159 let ops = Self::build_positioned_ops(ref_start0, &ops_input);
160
161 Self {
162 chr_id,
163 strand,
164 ref_start0,
165 seq: Self::uppercase_seq(seq),
166 qual,
167 ops,
168 finalized: false,
169 }
170 }
171
172 pub fn build_positioned_ops(ref_start0: u32, ops_input: &[(ReadOpKind, u32)]) -> Vec<ReadOp> {
174 let mut ref_pos = ref_start0;
175 let mut read_pos = 0u32;
176 let mut ops = Vec::with_capacity(ops_input.len());
177
178 for (kind, len) in ops_input {
179 ops.push(ReadOp::new(*kind, *len, ref_pos, read_pos));
180
181 if kind.consumes_ref() {
182 ref_pos += *len;
183 }
184
185 if kind.consumes_read() {
186 read_pos += *len;
187 }
188 }
189
190 ops
191 }
192
193 pub fn from_record(record: &Record, chr_id: usize) -> Self {
198 let strand = if record.is_reverse() {
199 Strand::Minus
200 } else {
201 Strand::Plus
202 };
203
204 Self::new(
205 chr_id,
206 strand,
207 record.pos() as u32,
208 record.seq().as_bytes(),
209 Some(record.qual().to_vec()),
210 Self::cigar_to_read_ops(&record.cigar()),
211 )
212 }
213
214 pub fn ref_blocks(&self) -> Vec<RefBlock> {
222 let mut blocks: Vec<RefBlock> = Vec::new();
223
224 for op in &self.ops {
225 if !op.kind.aligned_bases() {
226 continue;
227 }
228
229 let new_block = RefBlock {
230 start: op.ref_start0,
231 end: op.ref_end0(),
232 };
233
234 match blocks.last_mut() {
235 Some(last) if new_block.start <= last.end => {
236 last.end = last.end.max(new_block.end);
238 }
239 _ => blocks.push(new_block),
240 }
241 }
242
243 blocks
244 }
245
246 fn cigar_to_read_ops(cigar: &CigarStringView) -> Vec<(ReadOpKind, u32)> {
248 cigar
249 .iter()
250 .map(|op| Self::cigar_op_to_read_op(*op))
251 .collect()
252 }
253
254 pub fn cigar_op_to_read_op(op: Cigar) -> (ReadOpKind, u32) {
256 match op {
257 Cigar::Match(len) => (ReadOpKind::Match, len),
258 Cigar::Ins(len) => (ReadOpKind::Ins, len),
259 Cigar::Del(len) => (ReadOpKind::Del, len),
260 Cigar::RefSkip(len) => (ReadOpKind::RefSkip, len),
261 Cigar::SoftClip(len) => (ReadOpKind::SoftClip, len),
262 Cigar::HardClip(len) => (ReadOpKind::HardClip, len),
263 Cigar::Pad(len) => (ReadOpKind::Pad, len),
264 Cigar::Equal(len) => (ReadOpKind::Equal, len),
265 Cigar::Diff(len) => (ReadOpKind::Diff, len),
266 }
267 }
268
269 pub fn finalize(&mut self) -> Result<(), String> {
273 self.validate()?;
274 self.finalized = true;
275 Ok(())
276 }
277
278 pub fn is_finalized(&self) -> bool {
280 self.finalized
281 }
282
283 pub fn validate(&self) -> Result<(), String> {
285 if let Some(qual) = &self.qual
286 && qual.len() != self.seq.len()
287 {
288 return Err(format!(
289 "quality length ({}) does not match sequence length ({})",
290 qual.len(),
291 self.seq.len()
292 ));
293 }
294
295 let expected = self.read_len_from_ops() as usize;
296 if expected != self.seq.len() {
297 return Err(format!(
298 "read ops consume {expected} read bases, but sequence length is {}",
299 self.seq.len()
300 ));
301 }
302
303 Ok(())
304 }
305
306 pub fn read_len_from_ops(&self) -> u32 {
308 self.ops.last().map(|op| op.read_end()).unwrap_or(0)
309 }
310
311 pub fn ref_span(&self) -> Option<(u32, u32)> {
313 let start = self
314 .ops
315 .iter()
316 .filter(|op| op.kind.consumes_ref())
317 .map(|op| op.ref_start0)
318 .min()?;
319
320 let end = self
321 .ops
322 .iter()
323 .filter(|op| op.kind.consumes_ref())
324 .map(|op| op.ref_end0())
325 .max()?;
326
327 Some((start, end))
328 }
329
330 pub fn base_at_ref_pos(&self, pos0: u32) -> Option<ObservedBase> {
335 for op in &self.ops {
336 let read_pos = match op.read_pos_for_ref_pos(pos0) {
337 Some(read_pos) => read_pos,
338 None => continue,
339 };
340
341 let base = *self.seq.get(read_pos as usize)?;
342 let qual = self
343 .qual
344 .as_ref()
345 .and_then(|q| q.get(read_pos as usize).copied());
346
347 return Some(ObservedBase::new(base, qual, read_pos));
348 }
349
350 None
351 }
352
353 pub fn replace_ops(&mut self, ops_input: Vec<(ReadOpKind, u32)>) {
357 self.ops = Self::build_positioned_ops(self.ref_start0, &ops_input);
358 self.finalized = false;
359 }
360
361 pub fn op_pairs(&self) -> Vec<(ReadOpKind, u32)> {
363 self.ops.iter().map(|op| (op.kind, op.len)).collect()
364 }
365
366 pub fn uppercase_seq(seq: Vec<u8>) -> Vec<u8> {
368 seq.into_iter().map(|b| b.to_ascii_uppercase()).collect()
369 }
370}
371
372#[cfg(test)]
373mod tests {
374 use super::*;
375 use gtf_splice_index::types::RefBlock;
376 use crate::ReadOpKind;
377
378
379 impl AlignedRead {
380 fn simple_read() -> Self {
381 Self::new(
382 0,
383 Strand::Plus,
384 100,
385 b"ACGTACGTAA".to_vec(),
386 Some(vec![30; 10]),
387 vec![(ReadOpKind::Match, 10)],
388 )
389 }
390 }
391
392 #[test]
393 fn build_positioned_ops_tracks_coordinates() {
394 let ops = AlignedRead::build_positioned_ops(
395 100,
396 &[
397 (ReadOpKind::SoftClip, 5),
398 (ReadOpKind::Match, 10),
399 (ReadOpKind::Ins, 2),
400 (ReadOpKind::RefSkip, 100),
401 (ReadOpKind::Match, 15),
402 ],
403 );
404
405 assert_eq!(ops[0], ReadOp::new(ReadOpKind::SoftClip, 5, 100, 0));
406 assert_eq!(ops[1], ReadOp::new(ReadOpKind::Match, 10, 100, 5));
407 assert_eq!(ops[2], ReadOp::new(ReadOpKind::Ins, 2, 110, 15));
408 assert_eq!(ops[3], ReadOp::new(ReadOpKind::RefSkip, 100, 110, 17));
409 assert_eq!(ops[4], ReadOp::new(ReadOpKind::Match, 15, 210, 17));
410 }
411
412 #[test]
413 fn validate_accepts_consistent_read() {
414 let mut read = AlignedRead::simple_read();
415
416 assert!(read.finalize().is_ok());
417 assert!(read.is_finalized());
418 }
419
420 #[test]
421 fn validate_rejects_quality_length_mismatch() {
422 let read = AlignedRead::new(
423 0,
424 Strand::Plus,
425 100,
426 b"ACGT".to_vec(),
427 Some(vec![30; 3]),
428 vec![(ReadOpKind::Match, 4)],
429 );
430
431 assert!(read.validate().is_err());
432 }
433
434 #[test]
435 fn validate_rejects_sequence_length_mismatch() {
436 let read = AlignedRead::new(
437 0,
438 Strand::Plus,
439 100,
440 b"ACGT".to_vec(),
441 None,
442 vec![(ReadOpKind::Match, 3)],
443 );
444
445 assert!(read.validate().is_err());
446 }
447
448 #[test]
449 fn ref_span_includes_skips_and_deletions() {
450 let read = AlignedRead::new(
451 0,
452 Strand::Plus,
453 100,
454 b"AAAAATTTTT".to_vec(),
455 None,
456 vec![
457 (ReadOpKind::SoftClip, 5),
458 (ReadOpKind::Match, 5),
459 (ReadOpKind::RefSkip, 100),
460 (ReadOpKind::Match, 5),
461 ],
462 );
463
464 assert_eq!(read.ref_span(), Some((100, 210)));
465 }
466
467 #[test]
468 fn base_at_ref_pos_maps_match_positions() {
469 let read = AlignedRead::simple_read();
470
471 let obs = read.base_at_ref_pos(100).unwrap();
472 assert_eq!(obs.base, b'A');
473 assert_eq!(obs.qual, Some(30));
474 assert_eq!(obs.read_pos, 0);
475
476 let obs = read.base_at_ref_pos(103).unwrap();
477 assert_eq!(obs.base, b'T');
478 assert_eq!(obs.read_pos, 3);
479
480 assert!(read.base_at_ref_pos(99).is_none());
481 assert!(read.base_at_ref_pos(110).is_none());
482 }
483
484 #[test]
485 fn base_at_ref_pos_skips_deletions_and_refskips() {
486 let read = AlignedRead::new(
487 0,
488 Strand::Plus,
489 100,
490 b"AAAAACCCCC".to_vec(),
491 Some(vec![40; 10]),
492 vec![
493 (ReadOpKind::Match, 5),
494 (ReadOpKind::Del, 3),
495 (ReadOpKind::RefSkip, 100),
496 (ReadOpKind::Match, 5),
497 ],
498 );
499
500 assert!(read.base_at_ref_pos(102).is_some());
501 assert!(read.base_at_ref_pos(105).is_none());
502 assert!(read.base_at_ref_pos(108).is_none());
503 assert!(read.base_at_ref_pos(208).is_some());
504 }
505
506 #[test]
507 fn replace_ops_rebuilds_coordinates() {
508 let mut read = AlignedRead::simple_read();
509
510 read.replace_ops(vec![
511 (ReadOpKind::Match, 5),
512 (ReadOpKind::RefSkip, 100),
513 (ReadOpKind::Match, 5),
514 ]);
515
516 assert_eq!(read.ref_span(), Some((100, 210)));
517 assert!(!read.is_finalized());
518 }
519
520 #[test]
521 fn ref_blocks_merges_adjacent_aligned_ops() {
522 let read = AlignedRead::new(
523 0,
524 Strand::Plus,
525 100,
526 b"CCCCCCCCCCTGGGGGGGGGG".to_vec(),
527 Some(vec![30; 21]),
528 vec![
529 (ReadOpKind::Match, 10),
530 (ReadOpKind::Diff, 1),
531 (ReadOpKind::Match, 10),
532 ],
533 );
534
535 assert_eq!(
536 read.ref_blocks(),
537 vec![RefBlock {
538 start: 100,
539 end: 121,
540 }]
541 );
542 }
543
544 #[test]
545 fn ref_blocks_does_not_merge_across_refskip_or_deletion() {
546 let read = AlignedRead::new(
547 0,
548 Strand::Plus,
549 100,
550 b"CCCCCCCCCCGGGGGGGGGG".to_vec(),
551 Some(vec![30; 20]),
552 vec![
553 (ReadOpKind::Match, 10),
554 (ReadOpKind::RefSkip, 100),
555 (ReadOpKind::Match, 5),
556 (ReadOpKind::Del, 3),
557 (ReadOpKind::Match, 5),
558 ],
559 );
560
561 assert_eq!(
562 read.ref_blocks(),
563 vec![
564 RefBlock {
565 start: 100,
566 end: 110,
567 },
568 RefBlock {
569 start: 210,
570 end: 215,
571 },
572 RefBlock {
573 start: 218,
574 end: 223,
575 },
576 ]
577 );
578 }
579
580 #[test]
581 fn tp53_probe_positions_in_b05_read_are_not_covered() {
582 let read = AlignedRead::new(
583 0,
584 Strand::Minus,
585 7_359_184, vec![b'A'; 768],
587 Some(vec![30; 768]),
588 vec![
589 (ReadOpKind::SoftClip, 24),
590 (ReadOpKind::Match, 35),
591 (ReadOpKind::Del, 5),
592 (ReadOpKind::Match, 16),
593 (ReadOpKind::Del, 3),
594 (ReadOpKind::Match, 27),
595 (ReadOpKind::RefSkip, 236_919),
596 (ReadOpKind::Match, 138),
597 (ReadOpKind::Del, 1),
598 (ReadOpKind::Match, 22),
599 (ReadOpKind::Ins, 1),
600 (ReadOpKind::Match, 38),
601 (ReadOpKind::RefSkip, 90_635),
602 (ReadOpKind::Match, 213),
603 (ReadOpKind::Del, 1),
604 (ReadOpKind::Match, 167),
605 (ReadOpKind::Del, 1),
606 (ReadOpKind::Match, 6),
607 (ReadOpKind::Ins, 1),
608 (ReadOpKind::Match, 77),
609 (ReadOpKind::SoftClip, 4),
610 ],
611 );
612
613 assert!(read.base_at_ref_pos(7_675_994 - 1).is_none());
615 assert!(read.base_at_ref_pos(7_674_894 - 1).is_none());
616 assert!(read.base_at_ref_pos(7_674_953 - 1).is_none());
617 }
618
619 #[test]
620 fn refinement_can_relabel_match_equal_diff_without_breaking_coordinates() {
621 let mut read = AlignedRead::new(
622 0,
623 Strand::Plus,
624 100,
625 b"ACGTACGTAA".to_vec(),
626 Some(vec![30; 10]),
627 vec![(ReadOpKind::Match, 10)],
628 );
629
630 assert!(read.validate().is_ok());
631 assert_eq!(read.base_at_ref_pos(103).unwrap().base, b'T');
632
633 read.replace_ops(vec![
634 (ReadOpKind::Equal, 4),
635 (ReadOpKind::Diff, 1),
636 (ReadOpKind::Equal, 5),
637 ]);
638
639 assert!(read.validate().is_ok());
640 assert_eq!(read.base_at_ref_pos(103).unwrap().base, b'T');
641 assert_eq!(read.base_at_ref_pos(104).unwrap().base, b'A');
642 }
643}