use std::collections::{BTreeMap, VecDeque};
use std::ops::Index;
use super::{BEDLike, GenomicRange};
use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
#[derive(Debug, Clone)]
pub struct GIntervalMap<D>(BTreeMap<String, Lapper<u64, D>>);
impl<D> GIntervalMap<D> {
pub fn new() -> Self {
Self(BTreeMap::new())
}
pub fn len(&self) -> usize {
self.0.values().map(|x| x.len()).sum()
}
pub fn iter(&self) -> Iter<'_, D> {
Iter(
self.0
.iter()
.map(|(k, v)| (k, v.iter()))
.collect::<VecDeque<_>>(),
)
}
pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
let chr = bed.chrom();
let interval = Interval {
start: bed.start(),
stop: bed.end(),
val: value,
};
let tree = self
.0
.entry(chr.to_string())
.or_insert(Lapper::new(Vec::new()));
tree.insert(interval);
}
pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
let chr = bed.chrom().to_string();
match self.0.get(&chr) {
None => GIntervalQueryIter {
chrom: chr,
query_iter: None,
},
Some(tree) => GIntervalQueryIter {
chrom: chr,
query_iter: Some(tree.find(bed.start(), bed.end())),
},
}
}
pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
self.find(bed).next().is_some()
}
}
impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
let mut hmap: BTreeMap<String, Vec<_>> = BTreeMap::new();
for (bed, data) in iter {
let chr = bed.chrom();
let interval = Interval {
start: bed.start(),
stop: bed.end(),
val: data,
};
let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
vec.push(interval);
}
let hm = hmap
.into_iter()
.map(|(chr, vec)| (chr, Lapper::new(vec)))
.collect();
Self(hm)
}
}
#[derive(Debug)]
pub struct GIntervalQueryIter<'a, D> {
chrom: String,
query_iter: Option<IterFind<'a, u64, D>>,
}
impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
type Item = (GenomicRange, &'a D);
fn next(&mut self) -> Option<Self::Item> {
match self.query_iter {
None => return None,
Some(ref mut iter) => match iter.next() {
None => return None,
Some(interval) => {
let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
Some((bed, &interval.val))
}
},
}
}
}
pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
impl<'a, D> Iterator for Iter<'a, D> {
type Item = (GenomicRange, &'a D);
fn next(&mut self) -> Option<Self::Item> {
if self.0.is_empty() {
return None;
}
match self.0[0].1.next() {
None => {
self.0.pop_front();
self.next()
}
Some(interval) => {
let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
Some((bed, &interval.val))
}
}
}
}
#[derive(Debug, Clone)]
pub struct GIntervalIndexMap<D> {
data: Vec<D>,
indices: GIntervalMap<usize>,
}
impl<D> GIntervalIndexMap<D> {
pub fn new() -> Self {
Self {
data: Vec::new(),
indices: GIntervalMap::new(),
}
}
pub fn len(&self) -> usize {
self.data.len()
}
pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
}
pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
self.indices.find(bed)
}
pub fn get(&self, index: usize) -> Option<&D> {
self.data.get(index)
}
}
impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
let mut data = Vec::new();
let indices = iter
.into_iter()
.enumerate()
.map(|(i, (bed, val))| {
data.push(val);
(bed, i)
})
.collect();
Self { data, indices }
}
}
#[derive(Debug, Clone)]
pub struct GIntervalIndexSet {
data: Vec<GenomicRange>,
indices: GIntervalMap<usize>,
}
impl Index<usize> for GIntervalIndexSet {
type Output = GenomicRange;
fn index(&self, index: usize) -> &Self::Output {
&self.data[index]
}
}
impl GIntervalIndexSet {
pub fn len(&self) -> usize {
self.data.len()
}
pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
self.data.iter()
}
pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
self.find_full(bed).next().is_some()
}
pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
self.indices.find(bed).map(|x| x.0)
}
pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
self.indices.find(bed).map(|x| *x.1)
}
pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
self.indices.find(bed)
}
pub fn get(&self, index: usize) -> Option<&GenomicRange> {
self.data.get(index)
}
}
impl IntoIterator for GIntervalIndexSet {
type IntoIter = std::vec::IntoIter<Self::Item>;
type Item = GenomicRange;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
let mut data = Vec::new();
let indices = iter
.into_iter()
.enumerate()
.map(|(i, bed)| {
data.push(bed.to_genomic_range());
(bed, i)
})
.collect();
Self { data, indices }
}
}
#[cfg(test)]
mod bed_intersect_tests {
use super::*;
use itertools::Itertools;
#[test]
fn test_create() {
let beds: Vec<GenomicRange> = [
"chr1:200-500",
"chr1:200-500",
"chr1:1000-2000",
"chr1:1000-2000",
]
.into_iter()
.map(|x| x.parse().unwrap())
.collect();
let values = vec![2, 3, 5, 5];
let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
let result: Vec<_> = map.iter().sorted().collect();
let expected = vec![
(GenomicRange::new("chr1", 200, 500), &2),
(GenomicRange::new("chr1", 200, 500), &3),
(GenomicRange::new("chr1", 1000, 2000), &5),
(GenomicRange::new("chr1", 1000, 2000), &5),
(GenomicRange::new("chr1", 1000, 2000), &5),
(GenomicRange::new("chr1", 1000, 2000), &6),
];
assert_eq!(result, expected);
}
#[test]
fn test_intersect() {
let bed_set1: Vec<GenomicRange> = ["chr1:200-500", "chr1:200-500", "chr1:1000-2000"]
.into_iter()
.map(|x| x.parse().unwrap())
.collect();
let bed_set2: Vec<GenomicRange> = ["chr1:100-210", "chr1:100-200"]
.into_iter()
.map(|x| x.parse().unwrap())
.collect();
let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
assert_eq!(
tree.find(&bed_set2[0]).collect::<Vec<_>>(),
vec![
bed_set1.as_slice()[0].clone(),
bed_set1.as_slice()[0].clone()
]
);
let result: Vec<GenomicRange> = bed_set2
.into_iter()
.filter(|x| tree.is_overlapped(x))
.collect();
let expected = vec!["chr1:100-210".parse().unwrap()];
assert_eq!(result, expected);
}
#[test]
fn test_iter() {
fn random_bed() -> GenomicRange {
let chrom = format!("chr{}", rand::random::<u8>() % 10 + 1);
let start = rand::random::<u64>() % 1000;
let end = start + rand::random::<u64>() % 100;
GenomicRange::new(&chrom, start, end)
}
let beds: Vec<_> = (0..10000).map(|_| (random_bed(), ())).collect();
let map: GIntervalMap<()> = beds.clone().into_iter().collect();
assert_eq!(
map.iter().map(|(g, _)| g).collect::<Vec<_>>(),
beds.into_iter()
.map(|(g, _)| g)
.sorted()
.collect::<Vec<_>>(),
);
}
}