granges/
join.rs

1//! [`LeftGroupedJoin`], [`JoinData`], and [`JoinDataIterator`] types for overlaps.
2//!
3#![allow(clippy::all)]
4
5use std::collections::HashSet;
6
7use crate::{
8    traits::{GenericRange, IndexedDataContainer, JoinDataOperations},
9    Position,
10};
11
12/// This is a generic range used just in join logic, to avoid
13/// having to handle bringing range types into [`LeftGroupedJoin`],
14/// which clog up the API a bit.
15/// These can represent indexed and empty ranges.
16#[derive(Clone, Debug, PartialEq)]
17pub struct RangeTuple((Position, Position, Option<usize>));
18
19impl GenericRange for RangeTuple {
20    fn start(&self) -> Position {
21        self.0 .0
22    }
23    fn end(&self) -> Position {
24        self.0 .1
25    }
26    fn index(&self) -> Option<usize> {
27        self.0 .2
28    }
29}
30
31/// This is a special "reduced" range that stores indices
32/// to multiple data elements.
33#[derive(Clone, Debug, PartialEq)]
34pub struct RangeReduced((Position, Position, HashSet<Option<usize>>));
35
36impl RangeReduced {
37    pub fn indices(&self) -> &HashSet<Option<usize>> {
38        &self.0 .2
39    }
40}
41
42/// Create a vector of "reduced" or flattened ranges, that stack the indices of each range.
43///
44///
45/// This first finds every range position, sorts, and dedups these. Then,
46/// it iterates through each range of adjacent positions. Each of these
47/// new ranges is guaranteed to be covered by >= 1 range.
48pub fn reduce_ranges<R: GenericRange>(ranges: &Vec<R>) -> Vec<RangeReduced> {
49    let mut range_ends: Vec<Position> = ranges
50        .iter()
51        .flat_map(|range| vec![range.start(), range.end()].into_iter())
52        .collect();
53    range_ends.sort_unstable();
54    range_ends.dedup();
55
56    let mut ranges_reduced = Vec::new();
57    for range in range_ends.windows(2) {
58        let mut indices = HashSet::new();
59        if let [start, end] = range {
60            for range in ranges {
61                if range.start() < *end && range.end() > *start {
62                    indices.insert(range.index());
63                }
64            }
65            if !indices.is_empty() {
66                ranges_reduced.push(RangeReduced((*start, *end, indices)));
67            }
68        }
69    }
70    ranges_reduced
71}
72
73impl GenericRange for RangeReduced {
74    fn start(&self) -> Position {
75        self.0 .0
76    }
77    fn end(&self) -> Position {
78        self.0 .1
79    }
80    // Note: [`RangeReduced`] do not have valid indices,
81    // so this returns [`None`]. (They do have a [`Vec<Option<usize>>`]
82    // of indices, but this has to be accessed through the type.)
83    fn index(&self) -> Option<usize> {
84        None
85    }
86}
87
88/// [`LeftGroupedJoin`] contains information about the right ranges
89/// and their degree of overlap with a focal left range. This information
90/// is designed to facilitate downstream statistical sumamries of the
91/// corresponding data in overlapping ranges.
92#[derive(Clone, Debug, PartialEq)]
93pub struct LeftGroupedJoin {
94    /// The left range.
95    pub left: RangeTuple,
96
97    /// A `Vec` of the right overlapping ranges (unsorted).
98    // NOTE: previously lengths, overlap width, and their data
99    // indices were stored. For just one extra u32 we can store
100    // all the data in the original structure.
101    pub rights: Vec<RangeTuple>,
102}
103
104impl LeftGroupedJoin {
105    /// Create a new [`LeftGroupedJoin`].
106    pub fn new<R: GenericRange>(left_range: &R) -> Self {
107        Self {
108            left: RangeTuple(left_range.as_tuple()),
109            rights: Vec::new(),
110        }
111    }
112    /// Add a right (overlapping) range to this [`LeftGroupedJoin`].
113    ///
114    // Note: in principle, this can be called on *non-overlapping* right ranges too,
115    // for a full-outer join.
116    pub fn add_right<R: GenericRange>(&mut self, right: &R) {
117        self.rights.push(RangeTuple(right.as_tuple()))
118    }
119
120    /// Sort the right ranges, for faster downstream processing.
121    pub fn sort_ranges(&mut self) {
122        self.rights.sort_by(|a, b| {
123            a.start()
124                .cmp(&b.start())
125                .then_with(|| a.end().cmp(&b.end()))
126                .then_with(|| a.index().cmp(&b.index()))
127        });
128    }
129
130    /// "Reduce" the ranges into a minimum set, with all
131    /// their indices gathered in a [`Vec<Option<usize>>`].
132    /// This returns a [`Vec<RangeReduced>`].
133    pub fn reduce_ranges(&mut self) -> Vec<RangeReduced> {
134        // we need to trim these by the left range to get the
135        // proper overlaps within this left range
136        let rights: Vec<_> = self
137            .rights
138            .iter()
139            .map(|range| {
140                let (start, end) = range.overlap_range(&self.left).unwrap();
141                RangeTuple((start, end, range.index()))
142            })
143            .collect();
144        reduce_ranges(&rights)
145    }
146
147    /// Return whether this left range has any [`LeftGroupedJoin`].
148    pub fn has_overlaps(&self) -> bool {
149        !self.overlaps().is_empty()
150    }
151
152    /// Retrieve the number of right overlaps.
153    pub fn num_overlaps(&self) -> usize {
154        self.overlaps().len()
155    }
156
157    /// Retrieve the right overlaps.
158    pub fn overlaps(&self) -> Vec<Position> {
159        self.rights
160            .iter()
161            .map(|r| r.overlap_width(&self.left))
162            .collect()
163    }
164
165    /// Get the left index.
166    pub fn left_index(&self) -> Option<usize> {
167        self.left.index()
168    }
169
170    /// Get the right indices.
171    pub fn right_indices(&self) -> Vec<Option<usize>> {
172        self.rights.iter().map(|r| r.index()).collect()
173    }
174}
175
176/// [`JoinData`] contains a [`Vec<LeftGroupedJoin>`] of all overlap joins,
177/// and owns the left data container from the join. It stores a reference
178/// to the right data container.
179#[derive(Clone, Debug)]
180pub struct JoinData<'a, DL, DR> {
181    pub joins: Vec<LeftGroupedJoin>,
182    pub left_data: DL,
183    pub right_data: &'a DR,
184}
185
186impl<'a, DL, DR> JoinData<'a, DL, DR> {
187    /// Create a new [`JoinData`].
188    pub fn new(left_data: DL, right_data: &'a DR) -> Self {
189        let joins = Vec::new();
190        JoinData {
191            joins,
192            left_data,
193            right_data,
194        }
195    }
196
197    /// Push the [`LeftGroupedJoin`] to joins.
198    pub fn push(&mut self, join: LeftGroupedJoin) {
199        self.joins.push(join)
200    }
201
202    /// Get the total number of joins.
203    pub fn len(&self) -> usize {
204        self.joins.len()
205    }
206
207    /// Return whether the [`JoinData`] object is empty (contains no ranges).
208    pub fn is_empty(&self) -> bool {
209        self.len() == 0
210    }
211
212    /// Create an iterator over the joins.
213    pub fn iter(&'a self) -> JoinDataIterator<'a, DL, DR> {
214        JoinDataIterator {
215            inner: self.joins.iter(),
216            left_data: Some(&self.left_data),
217            right_data: Some(self.right_data),
218        }
219    }
220}
221
222pub struct CombinedJoinData<DL, DR> {
223    pub join: LeftGroupedJoin, // Information on the join
224    pub left_data: DL,         // The left data element
225    pub right_data: Vec<DR>,   // The right data elements
226}
227
228impl<DL, DR> JoinDataOperations<DL, DR> for CombinedJoinData<DL, DR> {
229    type LeftDataElementType = DL;
230    type RightDataElementType = DR;
231    fn join(&self) -> &LeftGroupedJoin {
232        &self.join
233    }
234    fn left_data(&self) -> Option<&Self::LeftDataElementType> {
235        Some(&self.left_data)
236    }
237    fn right_data(&self) -> Option<&Vec<Self::RightDataElementType>> {
238        Some(&self.right_data)
239    }
240}
241
242impl<'a, DL, DR> JoinData<'a, DL, DR>
243where
244    DL: IndexedDataContainer + 'a,
245    DR: IndexedDataContainer + 'a,
246{
247    /// Apply `func` to each element, putting the results into the returned
248    /// `Vec<U>`.
249    pub fn map<F, V>(self, func: F) -> Vec<V>
250    where
251        F: Fn(
252            CombinedJoinData<
253                <DL as IndexedDataContainer>::OwnedItem,
254                <DR as IndexedDataContainer>::OwnedItem,
255            >,
256        ) -> V,
257    {
258        self.joins
259            .into_iter()
260            .map(|join| {
261                let left_data = self.left_data.get_owned(join.left_index().unwrap());
262                let right_data = join
263                    .right_indices()
264                    .iter()
265                    .map(|idx| self.right_data.get_owned(idx.unwrap()))
266                    .collect();
267
268                func(CombinedJoinData {
269                    join,
270                    left_data,
271                    right_data,
272                })
273            })
274            .collect()
275    }
276}
277
278/// [`JoinDataLeftEmpty`] contains a [`Vec<LeftGroupedJoin>`] of all overlap joins,
279/// and stores a reference to the right data container.
280#[derive(Clone, Debug)]
281pub struct JoinDataLeftEmpty<'a, DR> {
282    pub joins: Vec<LeftGroupedJoin>,
283    pub right_data: &'a DR,
284}
285
286impl<'a, DR> JoinDataLeftEmpty<'a, DR> {
287    /// Create a new [`JoinData`].
288    pub fn new(right_data: &'a DR) -> Self {
289        let joins = Vec::new();
290        JoinDataLeftEmpty { joins, right_data }
291    }
292
293    /// Push the [`LeftGroupedJoin`] to joins.
294    pub fn push(&mut self, join: LeftGroupedJoin) {
295        self.joins.push(join)
296    }
297
298    /// Get the total number of joins.
299    pub fn len(&self) -> usize {
300        self.joins.len()
301    }
302
303    /// Return whether the [`JoinData`] object is empty (contains no ranges).
304    pub fn is_empty(&self) -> bool {
305        self.len() == 0
306    }
307
308    /// Create an iterator over the joins.
309    pub fn iter(&'a self) -> JoinDataIterator<'a, (), DR> {
310        JoinDataIterator {
311            inner: self.joins.iter(),
312            left_data: None,
313            right_data: Some(self.right_data),
314        }
315    }
316}
317
318pub struct CombinedJoinDataLeftEmpty<DR> {
319    pub join: LeftGroupedJoin, // Information on the join
320    pub right_data: Vec<DR>,   // The right data element
321}
322
323impl<DR> JoinDataOperations<(), DR> for CombinedJoinDataLeftEmpty<DR> {
324    type LeftDataElementType = ();
325    type RightDataElementType = DR;
326    fn join(&self) -> &LeftGroupedJoin {
327        &self.join
328    }
329    fn left_data(&self) -> Option<&Self::LeftDataElementType> {
330        None
331    }
332    fn right_data(&self) -> Option<&Vec<Self::RightDataElementType>> {
333        Some(&self.right_data)
334    }
335}
336
337impl<'a, DR> JoinDataLeftEmpty<'a, DR>
338where
339    DR: IndexedDataContainer + 'a,
340{
341    /// Apply `func` to each element, putting the results into the returned
342    /// `Vec<U>`.
343    pub fn map<F, V>(self, func: F) -> Vec<V>
344    where
345        F: Fn(CombinedJoinDataLeftEmpty<<DR as IndexedDataContainer>::OwnedItem>) -> V,
346    {
347        // TODO/OPTIMIZE: would consuming here (and analagous funcs) be better/faster?
348        // Would require a bit of a redesign.
349        self.joins
350            .into_iter()
351            .map(|join| {
352                let right_indices = join.right_indices();
353                let right_data = right_indices
354                    .iter()
355                    .map(|idx| self.right_data.get_owned(idx.unwrap()))
356                    .collect();
357
358                func(CombinedJoinDataLeftEmpty { join, right_data })
359            })
360            .collect()
361    }
362}
363
364/// [`JoinDataRightEmpty`] contains a [`Vec<LeftGroupedJoin>`] of all overlap joins,
365/// and owns the left data.
366#[derive(Clone, Debug)]
367pub struct JoinDataRightEmpty<DR> {
368    pub joins: Vec<LeftGroupedJoin>,
369    pub left_data: DR,
370}
371
372impl<'a, DL> JoinDataRightEmpty<DL> {
373    /// Create a new [`JoinData`].
374    pub fn new(left_data: DL) -> Self {
375        let joins = Vec::new();
376        JoinDataRightEmpty { joins, left_data }
377    }
378
379    /// Push the [`LeftGroupedJoin`] to joins.
380    pub fn push(&mut self, join: LeftGroupedJoin) {
381        self.joins.push(join)
382    }
383
384    /// Get the total number of joins.
385    pub fn len(&self) -> usize {
386        self.joins.len()
387    }
388
389    /// Return whether the [`JoinData`] object is empty (contains no ranges).
390    pub fn is_empty(&self) -> bool {
391        self.len() == 0
392    }
393
394    /// Create an iterator over the joins.
395    pub fn iter(&'a self) -> JoinDataIterator<'a, DL, ()> {
396        JoinDataIterator {
397            inner: self.joins.iter(),
398            left_data: Some(&self.left_data),
399            right_data: None,
400        }
401    }
402}
403
404pub struct CombinedJoinDataRightEmpty<DL> {
405    pub join: LeftGroupedJoin, // Information on the join
406    pub left_data: DL,         // The right data element
407}
408
409impl<DL> JoinDataOperations<DL, ()> for CombinedJoinDataRightEmpty<DL> {
410    type LeftDataElementType = DL;
411    type RightDataElementType = ();
412    fn join(&self) -> &LeftGroupedJoin {
413        &self.join
414    }
415    fn left_data(&self) -> Option<&Self::LeftDataElementType> {
416        Some(&self.left_data)
417    }
418    fn right_data(&self) -> Option<&Vec<Self::RightDataElementType>> {
419        None
420    }
421}
422
423impl<'a, DL> JoinDataRightEmpty<DL>
424where
425    DL: IndexedDataContainer,
426{
427    /// Apply `func` to each element, putting the results into the returned
428    /// `Vec<U>`.
429    pub fn map<F, V>(self, func: F) -> Vec<V>
430    where
431        F: Fn(CombinedJoinDataRightEmpty<<DL as IndexedDataContainer>::OwnedItem>) -> V,
432    {
433        self.joins
434            .into_iter()
435            .map(|join| {
436                let left_data = self.left_data.get_owned(join.left_index().unwrap());
437
438                func(CombinedJoinDataRightEmpty { join, left_data })
439            })
440            .collect()
441    }
442}
443
444/// [`JoinDataBothEmpty`] contains a [`Vec<LeftGroupedJoin>`] of all overlap joins
445/// without any owned or references to data containers.
446#[derive(Clone, Debug)]
447pub struct JoinDataBothEmpty {
448    pub joins: Vec<LeftGroupedJoin>,
449}
450
451impl JoinDataBothEmpty {
452    /// Create a new [`JoinData`].
453    pub fn new() -> Self {
454        let joins = Vec::new();
455        JoinDataBothEmpty { joins }
456    }
457
458    /// Push the [`LeftGroupedJoin`] to joins.
459    pub fn push(&mut self, join: LeftGroupedJoin) {
460        self.joins.push(join)
461    }
462
463    /// Get the total number of joins.
464    pub fn len(&self) -> usize {
465        self.joins.len()
466    }
467
468    /// Return whether the [`JoinData`] object is empty (contains no ranges).
469    pub fn is_empty(&self) -> bool {
470        self.len() == 0
471    }
472
473    /// Create an iterator over the joins.
474    pub fn iter(&self) -> JoinDataIterator<'_, (), ()> {
475        JoinDataIterator {
476            inner: self.joins.iter(),
477            left_data: None,
478            right_data: None,
479        }
480    }
481}
482
483pub struct CombinedJoinDataBothEmpty {
484    pub join: LeftGroupedJoin,
485}
486
487impl JoinDataOperations<(), ()> for CombinedJoinDataBothEmpty {
488    type LeftDataElementType = ();
489    type RightDataElementType = ();
490    fn join(&self) -> &LeftGroupedJoin {
491        &self.join
492    }
493    fn left_data(&self) -> Option<&Self::LeftDataElementType> {
494        None
495    }
496    fn right_data(&self) -> Option<&Vec<Self::RightDataElementType>> {
497        None
498    }
499}
500
501impl JoinDataBothEmpty {
502    /// Apply `func` to each element, putting the results into the returned
503    /// `Vec<U>`.
504    pub fn map<F, V>(self, func: F) -> Vec<V>
505    where
506        F: Fn(CombinedJoinDataBothEmpty) -> V,
507    {
508        self.joins
509            .into_iter()
510            .map(|join| func(CombinedJoinDataBothEmpty { join }))
511            .collect()
512    }
513}
514
515/// An iterator over the [`LeftGroupedJoin`] types that represent
516/// information about overlaps right ranges have with a particular left range.
517///
518/// This also contains references to the left and right data containers, for
519/// better ergonomics in downstream data processing.
520pub struct JoinDataIterator<'a, DL, DR> {
521    inner: std::slice::Iter<'a, LeftGroupedJoin>,
522    pub left_data: Option<&'a DL>,
523    pub right_data: Option<&'a DR>,
524}
525
526impl<'a, DL, DR> Iterator for JoinDataIterator<'a, DL, DR> {
527    type Item = &'a LeftGroupedJoin;
528
529    fn next(&mut self) -> Option<Self::Item> {
530        self.inner.next()
531    }
532}
533
534#[cfg(test)]
535mod tests {
536    use super::*;
537    use crate::ranges::RangeIndexed;
538
539    #[test]
540    fn test_join_data_new() {
541        let left_data = vec![1, 2];
542        let right_data = vec![4, 8];
543        let mut jd = JoinData::new(left_data, &right_data);
544        assert_eq!(jd.len(), 0);
545
546        let left = RangeIndexed::new(0, 10, 1);
547        let mut join = LeftGroupedJoin::new(&left);
548        let right = RangeIndexed::new(8, 10, 1);
549        join.add_right(&right);
550        jd.push(join);
551        assert_eq!(jd.len(), 1);
552    }
553
554    #[test]
555    fn test_single_range_indexed() {
556        let ranges = vec![RangeIndexed {
557            start: 1,
558            end: 5,
559            index: 0,
560        }];
561        let reduced = reduce_ranges(&ranges);
562
563        assert_eq!(reduced.len(), 1);
564        assert_eq!(reduced[0].0 .0, 1);
565        assert_eq!(reduced[0].0 .1, 5);
566        assert!(reduced[0].0 .2.contains(&Some(0)));
567    }
568
569    #[test]
570    fn test_overlapping_ranges_indexed() {
571        let ranges = vec![
572            RangeIndexed {
573                start: 1,
574                end: 4,
575                index: 0,
576            },
577            RangeIndexed {
578                start: 3,
579                end: 6,
580                index: 1,
581            },
582        ];
583        let reduced = reduce_ranges(&ranges);
584
585        assert_eq!(reduced.len(), 3);
586
587        let mut iter = reduced.iter();
588        let first_range = iter.next().unwrap();
589        assert_eq!(first_range.start(), 1);
590        assert_eq!(first_range.end(), 3);
591        assert_eq!(first_range.indices().len(), 1);
592        assert!(first_range.indices().contains(&Some(0)));
593
594        let second_range = iter.next().unwrap();
595        assert_eq!(second_range.start(), 3);
596        assert_eq!(second_range.end(), 4);
597        assert_eq!(second_range.indices().len(), 2);
598        assert!(second_range.indices().contains(&Some(0)));
599        assert!(second_range.indices().contains(&Some(1)));
600
601        let third_range = iter.next().unwrap();
602        assert_eq!(third_range.start(), 4);
603        assert_eq!(third_range.end(), 6);
604        assert_eq!(third_range.indices().len(), 1);
605        assert!(third_range.indices().contains(&Some(1)));
606    }
607
608    #[test]
609    fn test_non_overlapping_ranges_indexed() {
610        let ranges = vec![
611            RangeIndexed {
612                start: 1,
613                end: 3,
614                index: 0,
615            },
616            RangeIndexed {
617                start: 4,
618                end: 6,
619                index: 1,
620            },
621        ];
622        let reduced = reduce_ranges(&ranges);
623
624        assert_eq!(reduced.len(), 2);
625
626        let mut iter = reduced.iter();
627        let first_range = iter.next().unwrap();
628        assert_eq!(first_range.start(), 1);
629        assert_eq!(first_range.end(), 3);
630        assert_eq!(first_range.indices().len(), 1);
631        assert!(first_range.indices().contains(&Some(0)));
632
633        let second_range = iter.next().unwrap();
634        assert_eq!(second_range.start(), 4);
635        assert_eq!(second_range.end(), 6);
636        assert_eq!(second_range.indices().len(), 1);
637        assert!(second_range.indices().contains(&Some(1)));
638    }
639}