Skip to main content

fibertools_rs/utils/
bamannotations.rs

1use crate::utils::bamlift::*;
2use rust_htslib::bam;
3use rust_htslib::bam::ext::BamRecordExtensions;
4
5#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
6pub struct FiberAnnotation {
7    pub start: i64,
8    pub end: i64,
9    pub length: i64,
10    pub qual: u8,
11    pub reference_start: Option<i64>,
12    pub reference_end: Option<i64>,
13    pub reference_length: Option<i64>,
14    pub extra_columns: Option<Vec<String>>,
15}
16
17#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
18pub struct FiberAnnotations {
19    pub annotations: Vec<FiberAnnotation>,
20    pub seq_len: i64,
21    pub reverse: bool,
22}
23
24// Keep Ranges as an alias for backward compatibility
25pub type Ranges = FiberAnnotations;
26
27impl FiberAnnotations {
28    /// Create FiberAnnotations from a vector of FiberAnnotation items
29    pub fn from_annotations(
30        mut annotations: Vec<FiberAnnotation>,
31        seq_len: i64,
32        reverse: bool,
33    ) -> Self {
34        // Sort annotations by start position to ensure they are always in order
35        annotations.sort_by_key(|a| a.start);
36
37        Self {
38            annotations,
39            seq_len,
40            reverse,
41        }
42    }
43
44    /// starts and ends are [) intervals.
45    pub fn new(
46        record: &bam::Record,
47        mut forward_starts: Vec<i64>,
48        forward_ends: Option<Vec<i64>>,
49        mut lengths: Option<Vec<i64>>,
50    ) -> Self {
51        let mut single_bp_liftover = false;
52        // assume ends == starts if not provided
53        if forward_ends.is_none() && lengths.is_none() {
54            lengths = Some(vec![1; forward_starts.len()]);
55            single_bp_liftover = true;
56        }
57
58        // use ends or calculate them
59        let mut forward_ends_inclusive: Vec<i64> = match forward_ends {
60            Some(x) => x.into_iter().map(|x| x + 1).collect(),
61            None => forward_starts
62                .iter()
63                .zip(lengths.unwrap().iter())
64                .map(|(&x, &y)| x + y - 1)
65                .collect(),
66        };
67
68        // bam features for finding aligned positions
69        let is_reverse = record.is_reverse();
70        let seq_len = i64::try_from(record.seq_len()).unwrap();
71
72        // get positions and lengths in reference orientation
73        Self::positions_on_aligned_sequence(&mut forward_starts, is_reverse, seq_len);
74        Self::positions_on_aligned_sequence(&mut forward_ends_inclusive, is_reverse, seq_len);
75        let mut starts = forward_starts;
76        let mut ends = forward_ends_inclusive;
77
78        // swaps starts and ends if we reverse complemented
79        if record.is_reverse() {
80            std::mem::swap(&mut starts, &mut ends);
81        }
82
83        // swap back to non-inclusive ends
84        ends = ends.into_iter().map(|x| x + 1).collect();
85
86        // get lengths
87        let lengths = starts
88            .iter()
89            .zip(ends.iter())
90            .map(|(&x, &y)| Some(y - x))
91            .collect::<Vec<_>>();
92
93        let (reference_starts, reference_ends, reference_lengths) = if single_bp_liftover {
94            lift_query_range_exact(record, &starts, &starts)
95        } else {
96            lift_query_range(record, &starts, &ends)
97        }
98        .unwrap_or_else(|e| {
99            log::error!(
100                "Failed lifting over annotations in BAM record: {} aligned from {} to {}.",
101                String::from_utf8_lossy(record.qname()),
102                record.reference_start() + 1,
103                record.reference_end()
104            );
105            log::error!("Failed to lift query range: {}", e);
106            panic!("Failed to lift query range: {}", e);
107        });
108
109        // create annotations from parallel vectors
110        let mut annotations: Vec<FiberAnnotation> = starts
111            .into_iter()
112            .zip(ends)
113            .zip(lengths)
114            .zip(reference_starts)
115            .zip(reference_ends)
116            .zip(reference_lengths)
117            .map(
118                |(((((start, end), length), ref_start), ref_end), ref_length)| FiberAnnotation {
119                    start,
120                    end,
121                    length: length.unwrap_or(end - start),
122                    qual: 0,
123                    reference_start: ref_start,
124                    reference_end: ref_end,
125                    reference_length: ref_length,
126                    extra_columns: None,
127                },
128            )
129            .collect();
130
131        // Sort annotations by start position to ensure they are always in order
132        annotations.sort_by_key(|a| a.start);
133
134        // return object
135        FiberAnnotations {
136            annotations,
137            seq_len,
138            reverse: is_reverse,
139        }
140    }
141
142    pub fn set_qual(&mut self, mut forward_qual: Vec<u8>) {
143        assert_eq!(forward_qual.len(), self.annotations.len());
144        // flip if we are on the reverse strand
145        if self.reverse {
146            forward_qual.reverse();
147        }
148
149        for (annotation, &q) in self.annotations.iter_mut().zip(forward_qual.iter()) {
150            annotation.qual = q;
151        }
152    }
153
154    // Backward compatibility methods
155    pub fn starts(&self) -> Vec<i64> {
156        self.annotations.iter().map(|a| a.start).collect()
157    }
158
159    pub fn ends(&self) -> Vec<i64> {
160        self.annotations.iter().map(|a| a.end).collect()
161    }
162
163    pub fn lengths(&self) -> Vec<i64> {
164        self.annotations.iter().map(|a| a.length).collect()
165    }
166
167    // Option versions for consistency with reference methods
168    pub fn option_starts(&self) -> Vec<Option<i64>> {
169        self.annotations.iter().map(|a| Some(a.start)).collect()
170    }
171    pub fn option_ends(&self) -> Vec<Option<i64>> {
172        self.annotations.iter().map(|a| Some(a.end)).collect()
173    }
174    pub fn option_lengths(&self) -> Vec<Option<i64>> {
175        self.annotations.iter().map(|a| Some(a.length)).collect()
176    }
177
178    pub fn qual(&self) -> Vec<u8> {
179        self.annotations.iter().map(|a| a.qual).collect()
180    }
181
182    pub fn reference_starts(&self) -> Vec<Option<i64>> {
183        self.annotations.iter().map(|a| a.reference_start).collect()
184    }
185
186    pub fn reference_ends(&self) -> Vec<Option<i64>> {
187        self.annotations.iter().map(|a| a.reference_end).collect()
188    }
189
190    pub fn reference_lengths(&self) -> Vec<Option<i64>> {
191        self.annotations
192            .iter()
193            .map(|a| a.reference_length)
194            .collect()
195    }
196
197    /// get positions on the complimented sequence in the cigar record
198    fn positions_on_aligned_sequence(input_positions: &mut [i64], is_reverse: bool, seq_len: i64) {
199        if !is_reverse {
200            return;
201        }
202        //need to correct for going from [) to (] if we are part of a range
203        for p in input_positions.iter_mut() {
204            *p = seq_len - *p - 1;
205        }
206        input_positions.reverse();
207    }
208
209    /// get the molecular coordinates of the ranges, taking into account
210    /// the alignment orientation
211    pub fn get_molecular(&self) -> Vec<Option<(i64, i64, i64)>> {
212        self.annotations
213            .iter()
214            .map(|annotation| Some((annotation.start, annotation.end, annotation.length)))
215            .collect()
216    }
217
218    pub fn forward_starts(&self) -> Vec<i64> {
219        if !self.reverse {
220            // For forward reads, just return the starts as-is
221            self.starts()
222        } else {
223            // For reverse reads, we need to convert back to forward coordinates
224            // The stored starts are in reverse-complement coordinates and in reverse order
225            // We need to undo both transformations to get back to original forward coordinates
226
227            // First collect all transformations (reverse-complement back to forward)
228            let mut forward_starts: Vec<i64> = self
229                .annotations
230                .iter()
231                .map(|a| self.seq_len - a.start - 1)
232                .collect();
233
234            // Then reverse the order to undo the array reversal that was done during storage
235            forward_starts.reverse();
236            forward_starts
237        }
238    }
239
240    pub fn get_forward_quals(&self) -> Vec<u8> {
241        let mut forward: Vec<u8> = self.annotations.iter().map(|a| a.qual).collect();
242        if self.reverse {
243            forward.reverse();
244        }
245        forward
246    }
247
248    // filter out ranges that are less than the passed quality score
249    pub fn filter_by_qual(&mut self, min_qual: u8) {
250        self.annotations
251            .retain(|annotation| annotation.qual >= min_qual);
252    }
253
254    /// filter out ranges that are within the first or last X bp of the read
255    pub fn filter_starts_at_read_ends(&mut self, strip: i64) {
256        if strip == 0 {
257            return;
258        }
259
260        let original_len = self.annotations.len();
261        self.annotations.retain(|annotation| {
262            annotation.start >= strip && annotation.start <= self.seq_len - strip
263        });
264
265        if self.annotations.len() != original_len {
266            log::trace!(
267                "basemods stripped, {} basemods removed",
268                original_len - self.annotations.len()
269            );
270        }
271    }
272
273    pub fn to_strings(&self, reference: bool, skip_none: bool) -> Vec<String> {
274        let (s, e, l, q) = if reference {
275            (
276                self.reference_starts(),
277                self.reference_ends(),
278                self.reference_lengths(),
279                self.qual(),
280            )
281        } else {
282            (
283                self.option_starts(),
284                self.option_ends(),
285                self.option_lengths(),
286                self.qual(),
287            )
288        };
289
290        let s = crate::join_by_str_option_can_skip(&s, ",", skip_none);
291        let e = crate::join_by_str_option_can_skip(&e, ",", skip_none);
292        let l = crate::join_by_str_option_can_skip(&l, ",", skip_none);
293        if reference {
294            vec![s, e, l]
295        } else {
296            let q = crate::join_by_str(&q, ",");
297            vec![s, e, l, q]
298        }
299    }
300
301    /// get the reference coordinates of the ranges, taking into account
302    /// the alignment orientation
303    pub fn get_reference(&self) -> Vec<Option<(i64, i64, i64)>> {
304        self.annotations
305            .iter()
306            .map(|annotation| {
307                if let (Some(s), Some(e), Some(l)) = (
308                    annotation.reference_start,
309                    annotation.reference_end,
310                    annotation.reference_length,
311                ) {
312                    Some((s, e, l))
313                } else {
314                    None
315                }
316            })
317            .collect()
318    }
319
320    pub fn merge_ranges(multiple_ranges: Vec<&Self>) -> Self {
321        if multiple_ranges.is_empty() {
322            return Self::from_annotations(vec![], 0, false);
323        }
324
325        // check properties that must be the same
326        let reverse = multiple_ranges[0].reverse;
327        let seq_len = multiple_ranges[0].seq_len;
328        for r in multiple_ranges.iter() {
329            assert_eq!(r.reverse, reverse);
330            assert_eq!(r.seq_len, seq_len);
331        }
332
333        // collect all annotations
334        let annotations: Vec<FiberAnnotation> = multiple_ranges
335            .iter()
336            .flat_map(|r| r.annotations.clone())
337            .collect();
338
339        // Use from_annotations to ensure proper sorting
340        Self::from_annotations(annotations, seq_len, reverse)
341    }
342
343    fn apply_offset_helper(in_start: i64, in_end: i64, offset: i64, strand: char) -> (i64, i64) {
344        let mut start = in_start - offset;
345        let mut end = in_end - offset - 1; // make the end inclusive for
346        if strand == '-' {
347            start = -start;
348            end = -end;
349
350            // Swap start and end if we reverse complemented
351            if start > end {
352                std::mem::swap(&mut start, &mut end);
353            }
354        }
355        // make the end exclusive again
356        end += 1;
357        (start, end)
358    }
359
360    /// Apply offset to all molecular coordinates in the annotations
361    pub fn apply_offset(&mut self, offset: i64, ref_offset: i64, strand: char) {
362        for annotation in &mut self.annotations {
363            let (new_start, new_end) =
364                Self::apply_offset_helper(annotation.start, annotation.end, offset, strand);
365            annotation.start = new_start;
366            annotation.end = new_end;
367            assert!(
368                annotation.end - annotation.start == annotation.length,
369                "Annotation length mismatch after centering: {} != {} - {}",
370                annotation.length,
371                annotation.end,
372                annotation.start
373            );
374
375            // Apply offset to reference coordinates if they exist
376            if let (Some(ref_start), Some(ref_end)) =
377                (annotation.reference_start, annotation.reference_end)
378            {
379                let (new_ref_start, new_ref_end) =
380                    Self::apply_offset_helper(ref_start, ref_end, ref_offset, strand);
381                annotation.reference_start = Some(new_ref_start);
382                annotation.reference_end = Some(new_ref_end);
383                annotation.reference_length = Some(new_ref_end - new_ref_start + 1);
384            }
385        }
386        // reverse the annotations if we are on the reverse strand
387        if strand == '-' {
388            self.annotations.reverse();
389        }
390    }
391
392    /// Create FiberAnnotations from BAM tags with configurable tag names
393    pub fn from_bam_tags(
394        record: &rust_htslib::bam::Record,
395        start_tag: &[u8; 2],
396        length_tag: &[u8; 2],
397        annotation_tag: Option<&[u8; 2]>,
398    ) -> anyhow::Result<Option<Self>> {
399        // Check if record has the specified start and length tags
400        if let (Ok(start_aux), Ok(length_aux)) = (record.aux(start_tag), record.aux(length_tag)) {
401            if let (
402                rust_htslib::bam::record::Aux::ArrayU32(start_array),
403                rust_htslib::bam::record::Aux::ArrayU32(length_array),
404            ) = (start_aux, length_aux)
405            {
406                let start_values: Vec<u32> = start_array.iter().collect();
407                let length_values: Vec<u32> = length_array.iter().collect();
408
409                if start_values.len() != length_values.len() {
410                    return Err(anyhow::anyhow!(
411                        "Mismatched {} and {} array lengths",
412                        String::from_utf8_lossy(start_tag),
413                        String::from_utf8_lossy(length_tag)
414                    ));
415                }
416
417                // Convert to i64 vectors for FiberAnnotations::new
418                let forward_starts: Vec<i64> = start_values.iter().map(|&x| x as i64).collect();
419                let lengths: Vec<i64> = length_values.iter().map(|&x| x as i64).collect();
420
421                // Get annotation tag for extra columns if specified
422                let annotation_values = if let Some(ann_tag) = annotation_tag {
423                    if let Ok(rust_htslib::bam::record::Aux::String(ann_string)) =
424                        record.aux(ann_tag)
425                    {
426                        Some(ann_string.split('|').collect::<Vec<_>>())
427                    } else {
428                        None
429                    }
430                } else {
431                    None
432                };
433
434                // Use the existing FiberAnnotations::new method
435                let mut fiber_annotations = FiberAnnotations::new(
436                    record,
437                    forward_starts,
438                    None,          // no ends provided
439                    Some(lengths), // use lengths from fl tag
440                );
441
442                // Add extra columns to the annotations if present
443                if let Some(ann_vals) = annotation_values {
444                    for (i, annotation) in fiber_annotations.annotations.iter_mut().enumerate() {
445                        if i < ann_vals.len() && !ann_vals[i].is_empty() {
446                            annotation.extra_columns =
447                                Some(ann_vals[i].split(';').map(|s| s.to_string()).collect());
448                        }
449                    }
450                }
451
452                Ok(Some(fiber_annotations))
453            } else {
454                Ok(None) // Tags exist but wrong type
455            }
456        } else {
457            Ok(None) // No annotation tags
458        }
459    }
460
461    /// Create FiberAnnotations containing only annotations that overlap with the given range
462    pub fn overlapping_annotations(&self, range_start: i64, range_end: i64) -> Self {
463        let mut overlapping = Vec::new();
464
465        for annotation in &self.annotations {
466            // Check if annotation overlaps with range
467            if annotation.end > range_start && annotation.start < range_end {
468                overlapping.push(annotation.clone());
469            }
470        }
471
472        FiberAnnotations::from_annotations(overlapping, range_end - range_start, self.reverse)
473    }
474
475    /// Write annotations to BAM record as auxiliary tags
476    pub fn write_to_bam_tags(
477        &self,
478        record: &mut rust_htslib::bam::Record,
479        start_tag: &[u8; 2],
480        length_tag: &[u8; 2],
481        annotation_tag: Option<&[u8; 2]>,
482    ) -> anyhow::Result<()> {
483        use anyhow::Context;
484
485        if self.annotations.is_empty() {
486            return Ok(());
487        }
488
489        // Collect starts and lengths
490        let starts: Vec<u32> = self.annotations.iter().map(|a| a.start as u32).collect();
491
492        let lengths: Vec<u32> = self.annotations.iter().map(|a| a.length as u32).collect();
493
494        // Add start positions tag
495        record
496            .push_aux(
497                start_tag,
498                rust_htslib::bam::record::Aux::ArrayU32((&starts).into()),
499            )
500            .with_context(|| format!("Failed to add {} tag", String::from_utf8_lossy(start_tag)))?;
501
502        // Add lengths tag
503        record
504            .push_aux(
505                length_tag,
506                rust_htslib::bam::record::Aux::ArrayU32((&lengths).into()),
507            )
508            .with_context(|| {
509                format!("Failed to add {} tag", String::from_utf8_lossy(length_tag))
510            })?;
511
512        // Add annotation tag if requested and any annotations have extra columns
513        if let Some(ann_tag) = annotation_tag {
514            let extra_strings: Vec<String> = self
515                .annotations
516                .iter()
517                .map(|a| {
518                    if let Some(ref extra_cols) = a.extra_columns {
519                        extra_cols.join(";")
520                    } else {
521                        String::new()
522                    }
523                })
524                .collect();
525
526            // Only add the tag if there are non-empty extra columns
527            if extra_strings.iter().any(|s| !s.is_empty()) {
528                let joined_extra = extra_strings.join("|");
529                record
530                    .push_aux(
531                        ann_tag,
532                        rust_htslib::bam::record::Aux::String(&joined_extra),
533                    )
534                    .with_context(|| {
535                        format!("Failed to add {} tag", String::from_utf8_lossy(ann_tag))
536                    })?;
537            }
538        }
539        Ok(())
540    }
541}
542
543impl<'a> IntoIterator for &'a FiberAnnotations {
544    type Item = &'a FiberAnnotation;
545    type IntoIter = FiberAnnotationsIterator<'a>;
546
547    fn into_iter(self) -> Self::IntoIter {
548        FiberAnnotationsIterator {
549            annotations: self,
550            index: 0,
551        }
552    }
553}
554
555pub struct FiberAnnotationsIterator<'a> {
556    annotations: &'a FiberAnnotations,
557    index: usize,
558}
559
560impl<'a> Iterator for FiberAnnotationsIterator<'a> {
561    type Item = &'a FiberAnnotation;
562    fn next(&mut self) -> Option<Self::Item> {
563        if self.index >= self.annotations.annotations.len() {
564            return None;
565        }
566        let annotation = &self.annotations.annotations[self.index];
567        self.index += 1;
568        Some(annotation)
569    }
570}
571
572// Backward compatibility alias
573pub type RangesIterator<'a> = FiberAnnotationsIterator<'a>;