Skip to main content

snp_index/read/
aligned_read.rs

1//! Aligned read representation used for SNP matching and refinement.
2//!
3//! This struct provides a normalized, sequence-aware view of an alignment,
4//! independent of the original input format. While it can be constructed from
5//! BAM records (`rust-htslib`), it is designed to act as a stable intermediate
6//! representation for downstream processing (e.g. SNP matching, refinement).
7
8use gtf_splice_index::types::RefBlock;
9use rust_htslib::bam::Record;
10use rust_htslib::bam::record::{Cigar, CigarStringView};
11
12/// Read strand/orientation.
13#[derive(Debug, Clone, Copy, PartialEq, Eq)]
14pub enum Strand {
15    Plus,
16    Minus,
17    Unknown,
18}
19
20/// CIGAR-like operation kind.
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum ReadOpKind {
23    Match,
24    Equal,
25    Diff,
26    Ins,
27    Del,
28    RefSkip,
29    SoftClip,
30    HardClip,
31    Pad,
32}
33
34/// One alignment operation with explicit reference/read coordinates.
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub struct ReadOp {
37    pub kind: ReadOpKind,
38    pub len: u32,
39    pub ref_start0: u32,
40    pub read_start: u32,
41}
42
43/// A base observed in the read at a genomic reference position.
44#[derive(Debug, Clone, Copy, PartialEq, Eq)]
45pub struct ObservedBase {
46    pub base: u8,
47    pub qual: Option<u8>,
48    pub read_pos: u32,
49}
50
51/// BAM-independent aligned read.
52#[derive(Debug, Clone, PartialEq, Eq)]
53pub struct AlignedRead {
54    pub chr_id: usize,
55    pub strand: Strand,
56    pub ref_start0: u32,
57    pub seq: Vec<u8>,
58    pub qual: Option<Vec<u8>>,
59    pub ops: Vec<ReadOp>,
60    finalized: bool,
61}
62
63impl ReadOpKind {
64    /// Return true if this operation consumes reference coordinates.
65    pub fn consumes_ref(&self) -> bool {
66        matches!(
67            self,
68            Self::Match | Self::Equal | Self::Diff | Self::Del | Self::RefSkip
69        )
70    }
71
72    /// Return true if this operation consumes read/query coordinates.
73    pub fn consumes_read(&self) -> bool {
74        matches!(
75            self,
76            Self::Match | Self::Equal | Self::Diff | Self::Ins | Self::SoftClip
77        )
78    }
79
80    /// Return true if this operation has aligned read bases.
81    pub fn aligned_bases(&self) -> bool {
82        matches!(self, Self::Match | Self::Equal | Self::Diff)
83    }
84}
85
86impl ReadOp {
87    /// Create a positioned read operation.
88    pub fn new(kind: ReadOpKind, len: u32, ref_start0: u32, read_start: u32) -> Self {
89        Self {
90            kind,
91            len,
92            ref_start0,
93            read_start,
94        }
95    }
96
97    /// Return the exclusive reference end coordinate.
98    pub fn ref_end0(&self) -> u32 {
99        if self.kind.consumes_ref() {
100            self.ref_start0 + self.len
101        } else {
102            self.ref_start0
103        }
104    }
105
106    /// Return the exclusive read/query end coordinate.
107    pub fn read_end(&self) -> u32 {
108        if self.kind.consumes_read() {
109            self.read_start + self.len
110        } else {
111            self.read_start
112        }
113    }
114
115    /// Return true if this operation can yield a base for a reference position.
116    pub fn can_observe_reference_base(&self) -> bool {
117        self.kind.aligned_bases()
118    }
119
120    /// Return true if this operation covers `pos0` on the reference.
121    pub fn contains_ref_pos(&self, pos0: u32) -> bool {
122        self.kind.consumes_ref() && self.ref_start0 <= pos0 && pos0 < self.ref_end0()
123    }
124
125    /// Map reference coordinate to read coordinate for aligned-base operations.
126    pub fn read_pos_for_ref_pos(&self, pos0: u32) -> Option<u32> {
127        if !self.can_observe_reference_base() || !self.contains_ref_pos(pos0) {
128            return None;
129        }
130
131        Some(self.read_start + (pos0 - self.ref_start0))
132    }
133}
134
135impl ObservedBase {
136    /// Create a new observed base.
137    pub fn new(base: u8, qual: Option<u8>, read_pos: u32) -> Self {
138        Self {
139            base: base.to_ascii_uppercase(),
140            qual,
141            read_pos,
142        }
143    }
144}
145
146impl AlignedRead {
147    /// Create a new aligned read from raw pieces.
148    ///
149    /// `ops_input` contains only `(ReadOpKind, len)`. This constructor walks the
150    /// operations and fills reference/read coordinates.
151    pub fn new(
152        chr_id: usize,
153        strand: Strand,
154        ref_start0: u32,
155        seq: Vec<u8>,
156        qual: Option<Vec<u8>>,
157        ops_input: Vec<(ReadOpKind, u32)>,
158    ) -> Self {
159        let ops = Self::build_positioned_ops(ref_start0, &ops_input);
160
161        Self {
162            chr_id,
163            strand,
164            ref_start0,
165            seq: Self::uppercase_seq(seq),
166            qual,
167            ops,
168            finalized: false,
169        }
170    }
171
172    /// Build positioned operations from plain operation/length pairs.
173    pub fn build_positioned_ops(ref_start0: u32, ops_input: &[(ReadOpKind, u32)]) -> Vec<ReadOp> {
174        let mut ref_pos = ref_start0;
175        let mut read_pos = 0u32;
176        let mut ops = Vec::with_capacity(ops_input.len());
177
178        for (kind, len) in ops_input {
179            ops.push(ReadOp::new(*kind, *len, ref_pos, read_pos));
180
181            if kind.consumes_ref() {
182                ref_pos += *len;
183            }
184
185            if kind.consumes_read() {
186                read_pos += *len;
187            }
188        }
189
190        ops
191    }
192
193    /// Build an `AlignedRead` from a BAM record.
194    ///
195    /// `chr_id` must already be mapped into the same chromosome id space used by
196    /// `SnpIndex`.
197    pub fn from_record(record: &Record, chr_id: usize) -> Self {
198        let strand = if record.is_reverse() {
199            Strand::Minus
200        } else {
201            Strand::Plus
202        };
203
204        Self::new(
205            chr_id,
206            strand,
207            record.pos() as u32,
208            record.seq().as_bytes(),
209            Some(record.qual().to_vec()),
210            Self::cigar_to_read_ops(&record.cigar()),
211        )
212    }
213
214    /// Convert aligned read bases into genomic reference blocks.
215    ///
216    /// Only operations that contain aligned read bases are emitted:
217    /// `Match`, `Equal`, and `Diff`.
218    ///
219    /// Deletions and reference skips consume reference coordinates but do not
220    /// produce read-supported blocks.
221    pub fn ref_blocks(&self) -> Vec<RefBlock> {
222        let mut blocks: Vec<RefBlock> = Vec::new();
223
224        for op in &self.ops {
225            if !op.kind.aligned_bases() {
226                continue;
227            }
228
229            let new_block = RefBlock {
230                start: op.ref_start0,
231                end: op.ref_end0(),
232            };
233
234            match blocks.last_mut() {
235                Some(last) if new_block.start <= last.end => {
236                    // Merge adjacent or overlapping (robust, even if overlap shouldn't happen)
237                    last.end = last.end.max(new_block.end);
238                }
239                _ => blocks.push(new_block),
240            }
241        }
242
243        blocks
244    }
245
246    /// Convert a BAM CIGAR into `AlignedRead` operation pairs.
247    fn cigar_to_read_ops(cigar: &CigarStringView) -> Vec<(ReadOpKind, u32)> {
248        cigar
249            .iter()
250            .map(|op| Self::cigar_op_to_read_op(*op))
251            .collect()
252    }
253
254    /// Convert one BAM CIGAR op into a `ReadOpKind`.
255    pub fn cigar_op_to_read_op(op: Cigar) -> (ReadOpKind, u32) {
256        match op {
257            Cigar::Match(len) => (ReadOpKind::Match, len),
258            Cigar::Ins(len) => (ReadOpKind::Ins, len),
259            Cigar::Del(len) => (ReadOpKind::Del, len),
260            Cigar::RefSkip(len) => (ReadOpKind::RefSkip, len),
261            Cigar::SoftClip(len) => (ReadOpKind::SoftClip, len),
262            Cigar::HardClip(len) => (ReadOpKind::HardClip, len),
263            Cigar::Pad(len) => (ReadOpKind::Pad, len),
264            Cigar::Equal(len) => (ReadOpKind::Equal, len),
265            Cigar::Diff(len) => (ReadOpKind::Diff, len),
266        }
267    }
268
269    /// Validate the read and mark it finalized.
270    ///
271    /// Later refinement steps can also call this after changing operations.
272    pub fn finalize(&mut self) -> Result<(), String> {
273        self.validate()?;
274        self.finalized = true;
275        Ok(())
276    }
277
278    /// Return whether this read has been finalized.
279    pub fn is_finalized(&self) -> bool {
280        self.finalized
281    }
282
283    /// Validate sequence, quality, and operation consistency.
284    pub fn validate(&self) -> Result<(), String> {
285        if let Some(qual) = &self.qual
286            && qual.len() != self.seq.len()
287        {
288            return Err(format!(
289                "quality length ({}) does not match sequence length ({})",
290                qual.len(),
291                self.seq.len()
292            ));
293        }
294
295        let expected = self.read_len_from_ops() as usize;
296        if expected != self.seq.len() {
297            return Err(format!(
298                "read ops consume {expected} read bases, but sequence length is {}",
299                self.seq.len()
300            ));
301        }
302
303        Ok(())
304    }
305
306    /// Return read length implied by operations.
307    pub fn read_len_from_ops(&self) -> u32 {
308        self.ops.last().map(|op| op.read_end()).unwrap_or(0)
309    }
310
311    /// Return full reference span `[start0, end0)`.
312    pub fn ref_span(&self) -> Option<(u32, u32)> {
313        let start = self
314            .ops
315            .iter()
316            .filter(|op| op.kind.consumes_ref())
317            .map(|op| op.ref_start0)
318            .min()?;
319
320        let end = self
321            .ops
322            .iter()
323            .filter(|op| op.kind.consumes_ref())
324            .map(|op| op.ref_end0())
325            .max()?;
326
327        Some((start, end))
328    }
329
330    /// Return observed base at reference position `pos0`.
331    ///
332    /// Returns `None` for deletions, ref-skips, insertions, clips, pads, and
333    /// positions outside the alignment.
334    pub fn base_at_ref_pos(&self, pos0: u32) -> Option<ObservedBase> {
335        for op in &self.ops {
336            let read_pos = match op.read_pos_for_ref_pos(pos0) {
337                Some(read_pos) => read_pos,
338                None => continue,
339            };
340
341            let base = *self.seq.get(read_pos as usize)?;
342            let qual = self
343                .qual
344                .as_ref()
345                .and_then(|q| q.get(read_pos as usize).copied());
346
347            return Some(ObservedBase::new(base, qual, read_pos));
348        }
349
350        None
351    }
352
353    /// Replace operations from plain operation/length pairs.
354    ///
355    /// This is useful for refinement modules that rewrite CIGAR-like structure.
356    pub fn replace_ops(&mut self, ops_input: Vec<(ReadOpKind, u32)>) {
357        self.ops = Self::build_positioned_ops(self.ref_start0, &ops_input);
358        self.finalized = false;
359    }
360
361    /// Return operations as plain `(kind, len)` pairs.
362    pub fn op_pairs(&self) -> Vec<(ReadOpKind, u32)> {
363        self.ops.iter().map(|op| (op.kind, op.len)).collect()
364    }
365
366    /// Uppercase sequence bases.
367    pub fn uppercase_seq(seq: Vec<u8>) -> Vec<u8> {
368        seq.into_iter().map(|b| b.to_ascii_uppercase()).collect()
369    }
370}
371
372#[cfg(test)]
373mod tests {
374    use super::*;
375    use gtf_splice_index::types::RefBlock;
376    use crate::ReadOpKind;
377
378
379    impl AlignedRead {
380        fn simple_read() -> Self {
381            Self::new(
382                0,
383                Strand::Plus,
384                100,
385                b"ACGTACGTAA".to_vec(),
386                Some(vec![30; 10]),
387                vec![(ReadOpKind::Match, 10)],
388            )
389        }
390    }
391
392    #[test]
393    fn build_positioned_ops_tracks_coordinates() {
394        let ops = AlignedRead::build_positioned_ops(
395            100,
396            &[
397                (ReadOpKind::SoftClip, 5),
398                (ReadOpKind::Match, 10),
399                (ReadOpKind::Ins, 2),
400                (ReadOpKind::RefSkip, 100),
401                (ReadOpKind::Match, 15),
402            ],
403        );
404
405        assert_eq!(ops[0], ReadOp::new(ReadOpKind::SoftClip, 5, 100, 0));
406        assert_eq!(ops[1], ReadOp::new(ReadOpKind::Match, 10, 100, 5));
407        assert_eq!(ops[2], ReadOp::new(ReadOpKind::Ins, 2, 110, 15));
408        assert_eq!(ops[3], ReadOp::new(ReadOpKind::RefSkip, 100, 110, 17));
409        assert_eq!(ops[4], ReadOp::new(ReadOpKind::Match, 15, 210, 17));
410    }
411
412    #[test]
413    fn validate_accepts_consistent_read() {
414        let mut read = AlignedRead::simple_read();
415
416        assert!(read.finalize().is_ok());
417        assert!(read.is_finalized());
418    }
419
420    #[test]
421    fn validate_rejects_quality_length_mismatch() {
422        let read = AlignedRead::new(
423            0,
424            Strand::Plus,
425            100,
426            b"ACGT".to_vec(),
427            Some(vec![30; 3]),
428            vec![(ReadOpKind::Match, 4)],
429        );
430
431        assert!(read.validate().is_err());
432    }
433
434    #[test]
435    fn validate_rejects_sequence_length_mismatch() {
436        let read = AlignedRead::new(
437            0,
438            Strand::Plus,
439            100,
440            b"ACGT".to_vec(),
441            None,
442            vec![(ReadOpKind::Match, 3)],
443        );
444
445        assert!(read.validate().is_err());
446    }
447
448    #[test]
449    fn ref_span_includes_skips_and_deletions() {
450        let read = AlignedRead::new(
451            0,
452            Strand::Plus,
453            100,
454            b"AAAAATTTTT".to_vec(),
455            None,
456            vec![
457                (ReadOpKind::SoftClip, 5),
458                (ReadOpKind::Match, 5),
459                (ReadOpKind::RefSkip, 100),
460                (ReadOpKind::Match, 5),
461            ],
462        );
463
464        assert_eq!(read.ref_span(), Some((100, 210)));
465    }
466
467    #[test]
468    fn base_at_ref_pos_maps_match_positions() {
469        let read = AlignedRead::simple_read();
470
471        let obs = read.base_at_ref_pos(100).unwrap();
472        assert_eq!(obs.base, b'A');
473        assert_eq!(obs.qual, Some(30));
474        assert_eq!(obs.read_pos, 0);
475
476        let obs = read.base_at_ref_pos(103).unwrap();
477        assert_eq!(obs.base, b'T');
478        assert_eq!(obs.read_pos, 3);
479
480        assert!(read.base_at_ref_pos(99).is_none());
481        assert!(read.base_at_ref_pos(110).is_none());
482    }
483
484    #[test]
485    fn base_at_ref_pos_skips_deletions_and_refskips() {
486        let read = AlignedRead::new(
487            0,
488            Strand::Plus,
489            100,
490            b"AAAAACCCCC".to_vec(),
491            Some(vec![40; 10]),
492            vec![
493                (ReadOpKind::Match, 5),
494                (ReadOpKind::Del, 3),
495                (ReadOpKind::RefSkip, 100),
496                (ReadOpKind::Match, 5),
497            ],
498        );
499
500        assert!(read.base_at_ref_pos(102).is_some());
501        assert!(read.base_at_ref_pos(105).is_none());
502        assert!(read.base_at_ref_pos(108).is_none());
503        assert!(read.base_at_ref_pos(208).is_some());
504    }
505
506    #[test]
507    fn replace_ops_rebuilds_coordinates() {
508        let mut read = AlignedRead::simple_read();
509
510        read.replace_ops(vec![
511            (ReadOpKind::Match, 5),
512            (ReadOpKind::RefSkip, 100),
513            (ReadOpKind::Match, 5),
514        ]);
515
516        assert_eq!(read.ref_span(), Some((100, 210)));
517        assert!(!read.is_finalized());
518    }
519
520    #[test]
521    fn ref_blocks_merges_adjacent_aligned_ops() {
522        let read = AlignedRead::new(
523            0,
524            Strand::Plus,
525            100,
526            b"CCCCCCCCCCTGGGGGGGGGG".to_vec(),
527            Some(vec![30; 21]),
528            vec![
529                (ReadOpKind::Match, 10),
530                (ReadOpKind::Diff, 1),
531                (ReadOpKind::Match, 10),
532            ],
533        );
534
535        assert_eq!(
536            read.ref_blocks(),
537            vec![RefBlock {
538                start: 100,
539                end: 121,
540            }]
541        );
542    }
543
544    #[test]
545    fn ref_blocks_does_not_merge_across_refskip_or_deletion() {
546        let read = AlignedRead::new(
547            0,
548            Strand::Plus,
549            100,
550            b"CCCCCCCCCCGGGGGGGGGG".to_vec(),
551            Some(vec![30; 20]),
552            vec![
553                (ReadOpKind::Match, 10),
554                (ReadOpKind::RefSkip, 100),
555                (ReadOpKind::Match, 5),
556                (ReadOpKind::Del, 3),
557                (ReadOpKind::Match, 5),
558            ],
559        );
560
561        assert_eq!(
562            read.ref_blocks(),
563            vec![
564                RefBlock {
565                    start: 100,
566                    end: 110,
567                },
568                RefBlock {
569                    start: 210,
570                    end: 215,
571                },
572                RefBlock {
573                    start: 218,
574                    end: 223,
575                },
576            ]
577        );
578    }
579
580    #[test]
581    fn tp53_probe_positions_in_b05_read_are_not_covered() {
582        let read = AlignedRead::new(
583            0,
584            Strand::Minus,
585            7_359_184, // SAM POS 7359185 -> 0-based
586            vec![b'A'; 768],
587            Some(vec![30; 768]),
588            vec![
589                (ReadOpKind::SoftClip, 24),
590                (ReadOpKind::Match, 35),
591                (ReadOpKind::Del, 5),
592                (ReadOpKind::Match, 16),
593                (ReadOpKind::Del, 3),
594                (ReadOpKind::Match, 27),
595                (ReadOpKind::RefSkip, 236_919),
596                (ReadOpKind::Match, 138),
597                (ReadOpKind::Del, 1),
598                (ReadOpKind::Match, 22),
599                (ReadOpKind::Ins, 1),
600                (ReadOpKind::Match, 38),
601                (ReadOpKind::RefSkip, 90_635),
602                (ReadOpKind::Match, 213),
603                (ReadOpKind::Del, 1),
604                (ReadOpKind::Match, 167),
605                (ReadOpKind::Del, 1),
606                (ReadOpKind::Match, 6),
607                (ReadOpKind::Ins, 1),
608                (ReadOpKind::Match, 77),
609                (ReadOpKind::SoftClip, 4),
610            ],
611        );
612
613        // VCF positions are 1-based; base_at_ref_pos uses 0-based.
614        assert!(read.base_at_ref_pos(7_675_994 - 1).is_none());
615        assert!(read.base_at_ref_pos(7_674_894 - 1).is_none());
616        assert!(read.base_at_ref_pos(7_674_953 - 1).is_none());
617    }
618
619    #[test]
620    fn refinement_can_relabel_match_equal_diff_without_breaking_coordinates() {
621        let mut read = AlignedRead::new(
622            0,
623            Strand::Plus,
624            100,
625            b"ACGTACGTAA".to_vec(),
626            Some(vec![30; 10]),
627            vec![(ReadOpKind::Match, 10)],
628        );
629
630        assert!(read.validate().is_ok());
631        assert_eq!(read.base_at_ref_pos(103).unwrap().base, b'T');
632
633        read.replace_ops(vec![
634            (ReadOpKind::Equal, 4),
635            (ReadOpKind::Diff, 1),
636            (ReadOpKind::Equal, 5),
637        ]);
638
639        assert!(read.validate().is_ok());
640        assert_eq!(read.base_at_ref_pos(103).unwrap().base, b'T');
641        assert_eq!(read.base_at_ref_pos(104).unwrap().base, b'A');
642    }
643}