1use std::collections::{BTreeMap, VecDeque};
2use std::ops::Index;
3
4use super::{BEDLike, GenomicRange};
5use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
6
7#[derive(Debug, Clone)]
11pub struct GIntervalMap<D>(BTreeMap<String, Lapper<u64, D>>);
12
13impl<D> GIntervalMap<D> {
14 pub fn new() -> Self {
16 Self(BTreeMap::new())
17 }
18
19 pub fn len(&self) -> usize {
21 self.0.values().map(|x| x.len()).sum()
22 }
23
24 pub fn iter(&self) -> Iter<'_, D> {
27 Iter(
28 self.0
29 .iter()
30 .map(|(k, v)| (k, v.iter()))
31 .collect::<VecDeque<_>>(),
32 )
33 }
34
35 pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
37 let chr = bed.chrom();
38 let interval = Interval {
39 start: bed.start(),
40 stop: bed.end(),
41 val: value,
42 };
43 let tree = self
44 .0
45 .entry(chr.to_string())
46 .or_insert(Lapper::new(Vec::new()));
47 tree.insert(interval);
48 }
49
50 pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
52 let chr = bed.chrom().to_string();
53 match self.0.get(&chr) {
54 None => GIntervalQueryIter {
55 chrom: chr,
56 query_iter: None,
57 },
58 Some(tree) => GIntervalQueryIter {
59 chrom: chr,
60 query_iter: Some(tree.find(bed.start(), bed.end())),
61 },
62 }
63 }
64
65 pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
67 self.find(bed).next().is_some()
68 }
69}
70
71impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
72 fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
73 let mut hmap: BTreeMap<String, Vec<_>> = BTreeMap::new();
74 for (bed, data) in iter {
75 let chr = bed.chrom();
76 let interval = Interval {
77 start: bed.start(),
78 stop: bed.end(),
79 val: data,
80 };
81 let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
82 vec.push(interval);
83 }
84 let hm = hmap
85 .into_iter()
86 .map(|(chr, vec)| (chr, Lapper::new(vec)))
87 .collect();
88 Self(hm)
89 }
90}
91
92#[derive(Debug)]
95pub struct GIntervalQueryIter<'a, D> {
96 chrom: String,
97 query_iter: Option<IterFind<'a, u64, D>>,
98}
99
100impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
101 type Item = (GenomicRange, &'a D);
102
103 fn next(&mut self) -> Option<Self::Item> {
104 match self.query_iter {
105 None => return None,
106 Some(ref mut iter) => match iter.next() {
107 None => return None,
108 Some(interval) => {
109 let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
110 Some((bed, &interval.val))
111 }
112 },
113 }
114 }
115}
116
117pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
118
119impl<'a, D> Iterator for Iter<'a, D> {
120 type Item = (GenomicRange, &'a D);
121
122 fn next(&mut self) -> Option<Self::Item> {
123 if self.0.is_empty() {
124 return None;
125 }
126 match self.0[0].1.next() {
127 None => {
128 self.0.pop_front();
129 self.next()
130 }
131 Some(interval) => {
132 let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
133 Some((bed, &interval.val))
134 }
135 }
136 }
137}
138
139#[derive(Debug, Clone)]
141pub struct GIntervalIndexMap<D> {
142 data: Vec<D>,
143 indices: GIntervalMap<usize>,
144}
145
146impl<D> GIntervalIndexMap<D> {
147 pub fn new() -> Self {
148 Self {
149 data: Vec::new(),
150 indices: GIntervalMap::new(),
151 }
152 }
153
154 pub fn len(&self) -> usize {
155 self.data.len()
156 }
157
158 pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
160 self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
161 }
162
163 pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
165 self.indices.find(bed)
166 }
167
168 pub fn get(&self, index: usize) -> Option<&D> {
169 self.data.get(index)
170 }
171}
172
173impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
174 fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
175 let mut data = Vec::new();
176 let indices = iter
177 .into_iter()
178 .enumerate()
179 .map(|(i, (bed, val))| {
180 data.push(val);
181 (bed, i)
182 })
183 .collect();
184 Self { data, indices }
185 }
186}
187
188#[derive(Debug, Clone)]
190pub struct GIntervalIndexSet {
191 data: Vec<GenomicRange>,
192 indices: GIntervalMap<usize>,
193}
194
195impl Index<usize> for GIntervalIndexSet {
196 type Output = GenomicRange;
197
198 fn index(&self, index: usize) -> &Self::Output {
199 &self.data[index]
200 }
201}
202
203impl GIntervalIndexSet {
204 pub fn len(&self) -> usize {
205 self.data.len()
206 }
207
208 pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
209 self.data.iter()
210 }
211
212 pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
213 self.find_full(bed).next().is_some()
214 }
215
216 pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
218 self.indices.find(bed).map(|x| x.0)
219 }
220
221 pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
223 self.indices.find(bed).map(|x| *x.1)
224 }
225
226 pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
227 self.indices.find(bed)
228 }
229
230 pub fn get(&self, index: usize) -> Option<&GenomicRange> {
231 self.data.get(index)
232 }
233}
234
235impl IntoIterator for GIntervalIndexSet {
236 type IntoIter = std::vec::IntoIter<Self::Item>;
237 type Item = GenomicRange;
238
239 fn into_iter(self) -> Self::IntoIter {
240 self.data.into_iter()
241 }
242}
243
244impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
245 fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
246 let mut data = Vec::new();
247 let indices = iter
248 .into_iter()
249 .enumerate()
250 .map(|(i, bed)| {
251 data.push(bed.to_genomic_range());
252 (bed, i)
253 })
254 .collect();
255 Self { data, indices }
256 }
257}
258
259#[cfg(test)]
260mod bed_intersect_tests {
261 use super::*;
262 use itertools::Itertools;
263
264 #[test]
265 fn test_create() {
266 let beds: Vec<GenomicRange> = [
267 "chr1:200-500",
268 "chr1:200-500",
269 "chr1:1000-2000",
270 "chr1:1000-2000",
271 ]
272 .into_iter()
273 .map(|x| x.parse().unwrap())
274 .collect();
275 let values = vec![2, 3, 5, 5];
276 let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
277 map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
278 map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
279
280 let result: Vec<_> = map.iter().sorted().collect();
281 let expected = vec![
282 (GenomicRange::new("chr1", 200, 500), &2),
283 (GenomicRange::new("chr1", 200, 500), &3),
284 (GenomicRange::new("chr1", 1000, 2000), &5),
285 (GenomicRange::new("chr1", 1000, 2000), &5),
286 (GenomicRange::new("chr1", 1000, 2000), &5),
287 (GenomicRange::new("chr1", 1000, 2000), &6),
288 ];
289 assert_eq!(result, expected);
290 }
291
292 #[test]
293 fn test_intersect() {
294 let bed_set1: Vec<GenomicRange> = ["chr1:200-500", "chr1:200-500", "chr1:1000-2000"]
295 .into_iter()
296 .map(|x| x.parse().unwrap())
297 .collect();
298 let bed_set2: Vec<GenomicRange> = ["chr1:100-210", "chr1:100-200"]
299 .into_iter()
300 .map(|x| x.parse().unwrap())
301 .collect();
302 let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
303
304 assert_eq!(
305 tree.find(&bed_set2[0]).collect::<Vec<_>>(),
306 vec![
307 bed_set1.as_slice()[0].clone(),
308 bed_set1.as_slice()[0].clone()
309 ]
310 );
311
312 let result: Vec<GenomicRange> = bed_set2
313 .into_iter()
314 .filter(|x| tree.is_overlapped(x))
315 .collect();
316 let expected = vec!["chr1:100-210".parse().unwrap()];
317 assert_eq!(result, expected);
318 }
319
320 #[test]
321 fn test_iter() {
322 fn random_bed() -> GenomicRange {
323 let chrom = format!("chr{}", rand::random::<u8>() % 10 + 1);
324 let start = rand::random::<u64>() % 1000;
325 let end = start + rand::random::<u64>() % 100;
326 GenomicRange::new(&chrom, start, end)
327 }
328 let beds: Vec<_> = (0..10000).map(|_| (random_bed(), ())).collect();
329 let map: GIntervalMap<()> = beds.clone().into_iter().collect();
330 assert_eq!(
331 map.iter().map(|(g, _)| g).collect::<Vec<_>>(),
332 beds.into_iter()
333 .map(|(g, _)| g)
334 .sorted()
335 .collect::<Vec<_>>(),
336 );
337 }
338}