fibertools_rs/utils/
bamranges.rs

1use crate::utils::bamlift::*;
2use itertools::{izip, multiunzip};
3use rust_htslib::bam;
4
5#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
6pub struct Ranges {
7    pub starts: Vec<Option<i64>>,
8    pub ends: Vec<Option<i64>>,
9    pub lengths: Vec<Option<i64>>,
10    pub qual: Vec<u8>,
11    pub reference_starts: Vec<Option<i64>>,
12    pub reference_ends: Vec<Option<i64>>,
13    pub reference_lengths: Vec<Option<i64>>,
14    pub seq_len: i64,
15    pub reverse: bool,
16}
17
18impl Ranges {
19    /// starts and ends are [) intervals.
20    pub fn new(
21        record: &bam::Record,
22        mut forward_starts: Vec<i64>,
23        forward_ends: Option<Vec<i64>>,
24        mut lengths: Option<Vec<i64>>,
25    ) -> Self {
26        let mut single_bp_liftover = false;
27        // assume ends == starts if not provided
28        if forward_ends.is_none() && lengths.is_none() {
29            lengths = Some(vec![1; forward_starts.len()]);
30            single_bp_liftover = true;
31        }
32
33        // use ends or calculate them
34        let mut forward_ends_inclusive: Vec<i64> = match forward_ends {
35            Some(x) => x.into_iter().map(|x| x + 1).collect(),
36            None => forward_starts
37                .iter()
38                .zip(lengths.unwrap().iter())
39                .map(|(&x, &y)| x + y - 1)
40                .collect(),
41        };
42
43        // bam features for finding aligned positions
44        let is_reverse = record.is_reverse();
45        let seq_len = i64::try_from(record.seq_len()).unwrap();
46
47        // get positions and lengths in reference orientation
48        Self::positions_on_aligned_sequence(&mut forward_starts, is_reverse, seq_len);
49        Self::positions_on_aligned_sequence(&mut forward_ends_inclusive, is_reverse, seq_len);
50        let mut starts = forward_starts;
51        let mut ends = forward_ends_inclusive;
52
53        // swaps starts and ends if we reverse complemented
54        if record.is_reverse() {
55            std::mem::swap(&mut starts, &mut ends);
56        }
57
58        // swap back to non-inclusive ends
59        ends = ends.into_iter().map(|x| x + 1).collect();
60
61        // get lengths
62        let lengths = starts
63            .iter()
64            .zip(ends.iter())
65            .map(|(&x, &y)| Some(y - x))
66            .collect::<Vec<_>>();
67
68        let (reference_starts, reference_ends, reference_lengths) = if single_bp_liftover {
69            lift_query_range_exact(record, &starts, &starts)
70        } else {
71            lift_query_range(record, &starts, &ends)
72        };
73
74        // return object
75        Ranges {
76            starts: starts.into_iter().map(Some).collect(),
77            ends: ends.into_iter().map(Some).collect(),
78            lengths,
79            qual: vec![0; reference_starts.len()],
80            reference_starts,
81            reference_ends,
82            reference_lengths,
83            seq_len,
84            reverse: is_reverse,
85        }
86    }
87
88    pub fn set_qual(&mut self, qual: Vec<u8>) {
89        assert_eq!(qual.len(), self.starts.len());
90        self.qual = qual;
91        if self.reverse {
92            self.qual.reverse();
93        }
94    }
95
96    /// get positions on the complimented sequence in the cigar record
97    fn positions_on_aligned_sequence(input_positions: &mut [i64], is_reverse: bool, seq_len: i64) {
98        if !is_reverse {
99            return;
100        }
101        //need to correct for going from [) to (] if we are part of a range
102        for p in input_positions.iter_mut() {
103            *p = seq_len - *p - 1;
104        }
105        input_positions.reverse();
106    }
107
108    /// get the molecular coordinates of the ranges, taking into account
109    /// the alignment orientation
110    pub fn get_molecular(&self) -> Vec<Option<(i64, i64, i64)>> {
111        self.starts
112            .iter()
113            .zip(self.ends.iter())
114            .zip(self.lengths.iter())
115            .map(|((s, e), l)| {
116                if let (Some(s), Some(e), Some(l)) = (s, e, l) {
117                    Some((*s, *e, *l))
118                } else {
119                    None
120                }
121            })
122            .collect()
123    }
124
125    pub fn get_starts(&self) -> Vec<i64> {
126        self.starts.clone().into_iter().flatten().collect()
127    }
128
129    pub fn get_ends(&self) -> Vec<i64> {
130        self.ends.clone().into_iter().flatten().collect()
131    }
132
133    pub fn get_forward_starts(&self) -> Vec<i64> {
134        let mut z = self.get_starts();
135        Self::positions_on_aligned_sequence(&mut z, self.reverse, self.seq_len);
136        z
137    }
138
139    pub fn get_forward_quals(&self) -> Vec<u8> {
140        let mut forward = self.qual.clone();
141        if self.reverse {
142            forward.reverse();
143        }
144        forward
145    }
146
147    // filter out ranges that are less than the passed quality score
148    pub fn filter_by_qual(&mut self, min_qual: u8) {
149        let to_keep = self
150            .qual
151            .iter()
152            .enumerate()
153            .filter_map(|(i, &q)| if q >= min_qual { Some(i) } else { None })
154            .collect::<Vec<_>>();
155
156        self.starts = to_keep.iter().map(|&i| self.starts[i]).collect();
157        self.ends = to_keep.iter().map(|&i| self.ends[i]).collect();
158        self.lengths = to_keep.iter().map(|&i| self.lengths[i]).collect();
159        self.qual = to_keep.iter().map(|&i| self.qual[i]).collect();
160        self.reference_starts = to_keep.iter().map(|&i| self.reference_starts[i]).collect();
161        self.reference_ends = to_keep.iter().map(|&i| self.reference_ends[i]).collect();
162        self.reference_lengths = to_keep.iter().map(|&i| self.reference_lengths[i]).collect();
163    }
164
165    /// filter out ranges that are within the first or last X bp of the read
166    pub fn filter_starts_at_read_ends(&mut self, strip: i64) {
167        if strip == 0 {
168            return;
169        }
170        let to_keep = self
171            .starts
172            .iter()
173            .enumerate()
174            .filter_map(|(i, &s)| {
175                if let Some(s) = s {
176                    if s < strip || s > self.seq_len - strip {
177                        None
178                    } else {
179                        Some(i)
180                    }
181                } else {
182                    None
183                }
184            })
185            .collect::<Vec<_>>();
186
187        if to_keep.len() != self.starts.len() {
188            log::trace!(
189                "basemods stripped, {} basemods removed",
190                self.starts.len() - to_keep.len()
191            );
192        }
193
194        self.starts = to_keep.iter().map(|&i| self.starts[i]).collect();
195        self.ends = to_keep.iter().map(|&i| self.ends[i]).collect();
196        self.lengths = to_keep.iter().map(|&i| self.lengths[i]).collect();
197        self.qual = to_keep.iter().map(|&i| self.qual[i]).collect();
198        self.reference_starts = to_keep.iter().map(|&i| self.reference_starts[i]).collect();
199        self.reference_ends = to_keep.iter().map(|&i| self.reference_ends[i]).collect();
200        self.reference_lengths = to_keep.iter().map(|&i| self.reference_lengths[i]).collect();
201    }
202
203    pub fn to_strings(&self, reference: bool, skip_none: bool) -> Vec<String> {
204        let (s, e, l, q) = if reference {
205            (
206                &self.reference_starts,
207                &self.reference_ends,
208                &self.reference_lengths,
209                &self.qual,
210            )
211        } else {
212            (&self.starts, &self.ends, &self.lengths, &self.qual)
213        };
214
215        let s = crate::join_by_str_option_can_skip(s, ",", skip_none);
216        let e = crate::join_by_str_option_can_skip(e, ",", skip_none);
217        let l = crate::join_by_str_option_can_skip(l, ",", skip_none);
218        if reference {
219            vec![s, e, l]
220        } else {
221            let q = crate::join_by_str(q, ",");
222            vec![s, e, l, q]
223        }
224    }
225
226    /// get the reference coordinates of the ranges, taking into account
227    /// the alignment orientation
228    pub fn get_reference(&self) -> Vec<Option<(i64, i64, i64)>> {
229        self.reference_starts
230            .iter()
231            .zip(self.reference_ends.iter())
232            .zip(self.reference_lengths.iter())
233            .map(|((s, e), l)| {
234                if let (Some(s), Some(e), Some(l)) = (s, e, l) {
235                    Some((*s, *e, *l))
236                } else {
237                    None
238                }
239            })
240            .collect()
241    }
242
243    pub fn merge_ranges(multiple_ranges: Vec<&Self>) -> Self {
244        if multiple_ranges.is_empty() {
245            return Self {
246                starts: vec![],
247                ends: vec![],
248                lengths: vec![],
249                qual: vec![],
250                reference_starts: vec![],
251                reference_ends: vec![],
252                reference_lengths: vec![],
253                seq_len: 0,
254                reverse: false,
255            };
256        }
257        // check properties that must be the same
258        let reverse = multiple_ranges[0].reverse;
259        let seq_len = multiple_ranges[0].seq_len;
260        for r in multiple_ranges.iter() {
261            assert_eq!(r.reverse, reverse);
262            assert_eq!(r.seq_len, seq_len);
263        }
264        // get the other properties
265        let starts = multiple_ranges.iter().flat_map(|r| r.starts.clone());
266        let ends = multiple_ranges.iter().flat_map(|r| r.ends.clone());
267        let lengths = multiple_ranges.iter().flat_map(|r| r.lengths.clone());
268        let qual = multiple_ranges.iter().flat_map(|r| r.qual.clone());
269        let reference_starts = multiple_ranges
270            .iter()
271            .flat_map(|r| r.reference_starts.clone());
272        let reference_ends = multiple_ranges
273            .iter()
274            .flat_map(|r| r.reference_ends.clone());
275        let reference_lengths = multiple_ranges
276            .iter()
277            .flat_map(|r| r.reference_lengths.clone());
278
279        #[allow(clippy::type_complexity)]
280        let mut combo: Vec<(
281            Option<i64>,
282            Option<i64>,
283            Option<i64>,
284            u8,
285            Option<i64>,
286            Option<i64>,
287            Option<i64>,
288        )> = izip!(
289            starts,
290            ends,
291            lengths,
292            qual,
293            reference_starts,
294            reference_ends,
295            reference_lengths
296        )
297        .collect();
298        // sort by start position
299        combo.sort_by_key(|(s, _e, _l, _q, _r_s, _r_e, _r_l)| *s);
300        // unzip
301        let (starts, ends, lengths, qual, reference_starts, reference_ends, reference_lengths) =
302            multiunzip(combo);
303
304        Self {
305            starts,
306            ends,
307            lengths,
308            qual,
309            reference_starts,
310            reference_ends,
311            reference_lengths,
312            seq_len,
313            reverse,
314        }
315    }
316}
317
318impl<'a> IntoIterator for &'a Ranges {
319    type Item = (i64, i64, i64, u8, Option<(i64, i64, i64)>);
320    type IntoIter = RangesIterator<'a>;
321
322    fn into_iter(self) -> Self::IntoIter {
323        RangesIterator {
324            row: self,
325            index: 0,
326        }
327    }
328}
329
330pub struct RangesIterator<'a> {
331    row: &'a Ranges,
332    index: usize,
333}
334
335impl<'a> Iterator for RangesIterator<'a> {
336    type Item = (i64, i64, i64, u8, Option<(i64, i64, i64)>);
337    fn next(&mut self) -> Option<Self::Item> {
338        if self.index >= self.row.starts.len() {
339            return None;
340        }
341        let start = self.row.starts[self.index]?;
342        let end = self.row.ends[self.index]?;
343        let length = self.row.lengths[self.index]?;
344        let qual = self.row.qual[self.index];
345        let reference_start = self.row.reference_starts[self.index];
346        let reference_end = self.row.reference_ends[self.index];
347        let reference_length = self.row.reference_lengths[self.index];
348        let reference = match (reference_start, reference_end, reference_length) {
349            (Some(s), Some(e), Some(l)) => Some((s, e, l)),
350            _ => None,
351        };
352        self.index += 1;
353        Some((start, end, length, qual, reference))
354    }
355}