1use std::ops::Index;
2use std::collections::{BTreeMap, VecDeque};
3
4use super::{GenomicRange, BEDLike};
5use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
6
7#[derive(Debug, Clone)]
11pub struct GIntervalMap<D>(BTreeMap<String, Lapper<u64, D>>);
12
13impl<D> GIntervalMap<D> {
14 pub fn new() -> Self {
16 Self(BTreeMap::new())
17 }
18
19 pub fn len(&self) -> usize {
21 self.0.values().map(|x| x.len()).sum()
22 }
23
24 pub fn iter(&self) -> Iter<'_, D> {
27 Iter(self.0.iter().map(|(k, v)| (k, v.iter())).collect::<VecDeque<_>>())
28 }
29
30 pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
32 let chr = bed.chrom();
33 let interval = Interval { start: bed.start(), stop: bed.end(), val: value };
34 let tree = self.0.entry(chr.to_string()).or_insert(Lapper::new(Vec::new()));
35 tree.insert(interval);
36 }
37
38 pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
40 let chr = bed.chrom().to_string();
41 match self.0.get(&chr) {
42 None => GIntervalQueryIter { chrom: chr, query_iter: None },
43 Some(tree) => GIntervalQueryIter { chrom: chr, query_iter: Some(tree.find(bed.start(), bed.end())) }
44 }
45 }
46
47 pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
49 self.find(bed).next().is_some()
50 }
51}
52
53impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
54 fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
55 let mut hmap: BTreeMap<String, Vec<_>> = BTreeMap::new();
56 for (bed, data) in iter {
57 let chr = bed.chrom();
58 let interval = Interval { start: bed.start(), stop: bed.end(), val: data};
59 let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
60 vec.push(interval);
61 }
62 let hm = hmap.into_iter().map(|(chr, vec)| (chr, Lapper::new(vec))).collect();
63 Self(hm)
64 }
65}
66
67#[derive(Debug)]
70pub struct GIntervalQueryIter<'a, D> {
71 chrom: String,
72 query_iter: Option<IterFind<'a, u64, D>>,
73}
74
75impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
76 type Item = (GenomicRange, &'a D);
77
78 fn next(&mut self) -> Option<Self::Item> {
79 match self.query_iter {
80 None => return None,
81 Some(ref mut iter) => match iter.next() {
82 None => return None,
83 Some(interval) => {
84 let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
85 Some((bed, &interval.val))
86 }
87 }
88 }
89 }
90}
91
92pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
93
94impl<'a, D> Iterator for Iter<'a, D> {
95 type Item = (GenomicRange, &'a D);
96
97 fn next(&mut self) -> Option<Self::Item> {
98 if self.0.is_empty() {
99 return None
100 }
101 match self.0[0].1.next() {
102 None => {
103 self.0.pop_front();
104 self.next()
105 }
106 Some(interval) => {
107 let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
108 Some((bed, &interval.val))
109 }
110 }
111 }
112}
113
114#[derive(Debug, Clone)]
116pub struct GIntervalIndexMap<D> {
117 data: Vec<D>,
118 indices: GIntervalMap<usize>,
119}
120
121impl<D> GIntervalIndexMap<D> {
122 pub fn new() -> Self {
123 Self { data: Vec::new(), indices: GIntervalMap::new() }
124 }
125
126 pub fn len(&self) -> usize {
127 self.data.len()
128 }
129
130 pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
132 self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
133 }
134
135 pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
137 self.indices.find(bed)
138 }
139
140 pub fn get(&self, index: usize) -> Option<&D> {
141 self.data.get(index)
142 }
143}
144
145impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
146 fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
147 let mut data = Vec::new();
148 let indices = iter.into_iter().enumerate().map(|(i, (bed, val))| {
149 data.push(val);
150 (bed, i)
151 }).collect();
152 Self { data, indices }
153 }
154}
155
156
157#[derive(Debug, Clone)]
159pub struct GIntervalIndexSet {
160 data: Vec<GenomicRange>,
161 indices: GIntervalMap<usize>,
162}
163
164impl Index<usize> for GIntervalIndexSet {
165 type Output = GenomicRange;
166
167 fn index(&self, index: usize) -> &Self::Output {
168 &self.data[index]
169 }
170}
171
172impl GIntervalIndexSet {
173 pub fn len(&self) -> usize {
174 self.data.len()
175 }
176
177 pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
178 self.data.iter()
179 }
180
181 pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
182 self.find_full(bed).next().is_some()
183 }
184
185 pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
187 self.indices.find(bed).map(|x| x.0)
188 }
189
190 pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
192 self.indices.find(bed).map(|x| *x.1)
193 }
194
195 pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
196 self.indices.find(bed)
197 }
198
199 pub fn get(&self, index: usize) -> Option<&GenomicRange> {
200 self.data.get(index)
201 }
202}
203
204impl IntoIterator for GIntervalIndexSet {
205 type IntoIter = std::vec::IntoIter<Self::Item>;
206 type Item = GenomicRange;
207
208 fn into_iter(self) -> Self::IntoIter {
209 self.data.into_iter()
210 }
211}
212
213impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
214 fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
215 let mut data = Vec::new();
216 let indices = iter.into_iter().enumerate().map(|(i, bed)| {
217 data.push(bed.to_genomic_range());
218 (bed, i)
219 }).collect();
220 Self { data, indices }
221 }
222}
223
224#[cfg(test)]
225mod bed_intersect_tests {
226 use super::*;
227 use itertools::Itertools;
228
229 #[test]
230 fn test_create() {
231 let beds: Vec<GenomicRange> = [
232 "chr1:200-500",
233 "chr1:200-500",
234 "chr1:1000-2000",
235 "chr1:1000-2000",
236 ].into_iter().map(|x| x.parse().unwrap()).collect();
237 let values = vec![2, 3, 5, 5];
238 let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
239 map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
240 map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
241
242 let result: Vec<_> = map.iter().sorted().collect();
243 let expected = vec![
244 (GenomicRange::new("chr1", 200, 500), &2),
245 (GenomicRange::new("chr1", 200, 500), &3),
246 (GenomicRange::new("chr1", 1000, 2000), &5),
247 (GenomicRange::new("chr1", 1000, 2000), &5),
248 (GenomicRange::new("chr1", 1000, 2000), &5),
249 (GenomicRange::new("chr1", 1000, 2000), &6),
250 ];
251 assert_eq!(result, expected);
252 }
253
254 #[test]
255 fn test_intersect() {
256 let bed_set1: Vec<GenomicRange> = [
257 "chr1:200-500",
258 "chr1:200-500",
259 "chr1:1000-2000",
260 ].into_iter().map(|x| x.parse().unwrap()).collect();
261 let bed_set2: Vec<GenomicRange> = [
262 "chr1:100-210",
263 "chr1:100-200",
264 ].into_iter().map(|x| x.parse().unwrap()).collect();
265 let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
266
267 assert_eq!(
268 tree.find(&bed_set2[0]).collect::<Vec<_>>(),
269 vec![bed_set1.as_slice()[0].clone(), bed_set1.as_slice()[0].clone()]
270 );
271
272 let result: Vec<GenomicRange> = bed_set2.into_iter().filter(|x| tree.is_overlapped(x)).collect();
273 let expected = vec!["chr1:100-210".parse().unwrap()];
274 assert_eq!(result, expected);
275 }
276
277 #[test]
278 fn test_iter() {
279 fn random_bed() -> GenomicRange {
280 let chrom = format!("chr{}", rand::random::<u8>() % 10 + 1);
281 let start = rand::random::<u64>() % 1000;
282 let end = start + rand::random::<u64>() % 100;
283 GenomicRange::new(&chrom, start, end)
284 }
285 let beds: Vec<_> = (0..10000).map(|_| (random_bed(), ())).collect();
286 let map: GIntervalMap<()> = beds.clone().into_iter().collect();
287 assert_eq!(
288 map.iter().map(|(g, _)| g).collect::<Vec<_>>(),
289 beds.into_iter().map(|(g, _)| g).sorted().collect::<Vec<_>>(),
290 );
291 }
292}