use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::formats::bed::{BedReader, BedRecord};
use crate::interval::GenomicInterval;
use std::collections::HashMap;
use std::path::Path;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Annotation {
pub interval: GenomicInterval,
pub data: String,
}
#[derive(Debug, Clone)]
pub struct IntervalTreeNode {
center: u64,
left_sorted: Vec<Annotation>, right_sorted: Vec<Annotation>, left: Option<Box<IntervalTreeNode>>,
right: Option<Box<IntervalTreeNode>>,
}
#[derive(Debug, Clone)]
pub struct IntervalTree {
root: Option<IntervalTreeNode>,
count: usize,
}
impl IntervalTree {
pub fn new() -> Self {
Self {
root: None,
count: 0,
}
}
pub fn from_annotations(annotations: Vec<Annotation>) -> Self {
let count = annotations.len();
let root = IntervalTreeNode::from_annotations(annotations);
Self { root, count }
}
pub fn query<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a str> {
let mut results = Vec::new();
if let Some(ref root) = self.root {
root.query(interval, &mut results);
}
results
}
pub fn query_annotations<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a Annotation> {
let mut results = Vec::new();
if let Some(ref root) = self.root {
root.query_annotations(interval, &mut results);
}
results
}
pub fn len(&self) -> usize {
self.count
}
pub fn is_empty(&self) -> bool {
self.count == 0
}
}
impl Default for IntervalTree {
fn default() -> Self {
Self::new()
}
}
impl IntervalTreeNode {
fn from_annotations(annotations: Vec<Annotation>) -> Option<Self> {
if annotations.is_empty() {
return None;
}
if annotations.len() <= 64 {
let mut left_sorted = annotations.clone();
let mut right_sorted = annotations;
left_sorted.sort_by_key(|a| a.interval.start);
right_sorted.sort_by_key(|a| a.interval.end);
let center = left_sorted[left_sorted.len() / 2].interval.start;
return Some(IntervalTreeNode {
center,
left_sorted,
right_sorted,
left: None,
right: None,
});
}
let mut coords: Vec<u64> = annotations
.iter()
.flat_map(|a| vec![a.interval.start, a.interval.end])
.collect();
coords.sort_unstable();
let center = coords[coords.len() / 2];
let mut left_intervals = Vec::new();
let mut right_intervals = Vec::new();
let mut center_intervals = Vec::new();
for ann in annotations {
if ann.interval.end <= center {
left_intervals.push(ann);
} else if ann.interval.start > center {
right_intervals.push(ann);
} else {
center_intervals.push(ann);
}
}
let mut left_sorted = center_intervals.clone();
let mut right_sorted = center_intervals;
left_sorted.sort_by_key(|a| a.interval.start);
right_sorted.sort_by_key(|a| a.interval.end);
Some(IntervalTreeNode {
center,
left_sorted,
right_sorted,
left: IntervalTreeNode::from_annotations(left_intervals).map(Box::new),
right: IntervalTreeNode::from_annotations(right_intervals).map(Box::new),
})
}
fn query<'a>(&'a self, interval: &GenomicInterval, results: &mut Vec<&'a str>) {
if interval.end <= self.center {
for ann in &self.left_sorted {
if ann.interval.start >= interval.end {
break; }
if ann.interval.overlaps(interval) {
results.push(&ann.data);
}
}
} else if interval.start > self.center {
for ann in self.right_sorted.iter().rev() {
if ann.interval.end <= interval.start {
break; }
if ann.interval.overlaps(interval) {
results.push(&ann.data);
}
}
} else {
for ann in &self.left_sorted {
if ann.interval.overlaps(interval) {
results.push(&ann.data);
}
}
}
if interval.start < self.center {
if let Some(ref left) = self.left {
left.query(interval, results);
}
}
if interval.end > self.center {
if let Some(ref right) = self.right {
right.query(interval, results);
}
}
}
fn query_annotations<'a>(
&'a self,
interval: &GenomicInterval,
results: &mut Vec<&'a Annotation>,
) {
if interval.end <= self.center {
for ann in &self.left_sorted {
if ann.interval.start >= interval.end {
break;
}
if ann.interval.overlaps(interval) {
results.push(ann);
}
}
} else if interval.start > self.center {
for ann in self.right_sorted.iter().rev() {
if ann.interval.end <= interval.start {
break;
}
if ann.interval.overlaps(interval) {
results.push(ann);
}
}
} else {
for ann in &self.left_sorted {
if ann.interval.overlaps(interval) {
results.push(ann);
}
}
}
if interval.start < self.center {
if let Some(ref left) = self.left {
left.query_annotations(interval, results);
}
}
if interval.end > self.center {
if let Some(ref right) = self.right {
right.query_annotations(interval, results);
}
}
}
}
#[derive(Debug, Clone)]
pub struct AnnotationIndex {
trees: HashMap<String, IntervalTreeNode>,
count: usize,
}
impl AnnotationIndex {
pub fn new() -> Self {
Self {
trees: HashMap::new(),
count: 0,
}
}
pub fn build_trees(&mut self, annotations: HashMap<String, Vec<Annotation>>) {
self.count = annotations.values().map(|v| v.len()).sum();
for (chrom, ann_list) in annotations {
if let Some(tree) = IntervalTreeNode::from_annotations(ann_list) {
self.trees.insert(chrom, tree);
}
}
}
pub fn from_bed<P, F>(path: P, extractor: F) -> Result<Self>
where
P: AsRef<Path>,
F: Fn(&BedRecord) -> String,
{
let mut reader = BedReader::from_path(path)?;
let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
while let Some(record) = reader.next_record()? {
let interval: GenomicInterval = (&record).into();
let data = extractor(&record);
annotations
.entry(interval.chrom.clone())
.or_insert_with(Vec::new)
.push(Annotation { interval, data });
}
let mut index = Self::new();
index.build_trees(annotations);
Ok(index)
}
pub fn from_records<'a, I, F>(records: I, extractor: F) -> Self
where
I: IntoIterator<Item = &'a BedRecord>,
F: Fn(&BedRecord) -> String,
{
let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
for record in records {
let interval: GenomicInterval = record.into();
let data = extractor(record);
annotations
.entry(interval.chrom.clone())
.or_insert_with(Vec::new)
.push(Annotation { interval, data });
}
let mut index = Self::new();
index.build_trees(annotations);
index
}
pub fn from_reader<I, F>(mut reader: I, extractor: F) -> Result<Self>
where
I: crate::core::GenomicRecordIterator<Record = BedRecord>,
F: Fn(&BedRecord) -> String,
{
let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
while let Some(record) = reader.next_record()? {
let interval: GenomicInterval = (&record).into();
let data = extractor(&record);
annotations
.entry(interval.chrom.clone())
.or_insert_with(Vec::new)
.push(Annotation { interval, data });
}
let mut index = Self::new();
index.build_trees(annotations);
Ok(index)
}
pub fn query(&self, interval: &GenomicInterval) -> Vec<&str> {
let mut results = Vec::new();
if let Some(tree) = self.trees.get(&interval.chrom) {
tree.query(interval, &mut results);
return results;
}
let alt_chrom = if interval.chrom.starts_with("chr") {
interval.chrom.trim_start_matches("chr").to_string()
} else {
format!("chr{}", interval.chrom)
};
if let Some(tree) = self.trees.get(&alt_chrom) {
let normalized_query = GenomicInterval {
chrom: alt_chrom,
start: interval.start,
end: interval.end,
};
tree.query(&normalized_query, &mut results);
}
results
}
pub fn query_annotations(&self, interval: &GenomicInterval) -> Vec<&Annotation> {
let mut results = Vec::new();
if let Some(tree) = self.trees.get(&interval.chrom) {
tree.query_annotations(interval, &mut results);
return results;
}
let alt_chrom = if interval.chrom.starts_with("chr") {
interval.chrom.trim_start_matches("chr").to_string()
} else {
format!("chr{}", interval.chrom)
};
if let Some(tree) = self.trees.get(&alt_chrom) {
let normalized_query = GenomicInterval {
chrom: alt_chrom,
start: interval.start,
end: interval.end,
};
tree.query_annotations(&normalized_query, &mut results);
}
results
}
pub fn len(&self) -> usize {
self.count
}
pub fn is_empty(&self) -> bool {
self.count == 0
}
pub fn chromosomes(&self) -> Vec<&str> {
self.trees.keys().map(|s| s.as_str()).collect()
}
}
impl Default for AnnotationIndex {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct MultiQueryResult<'a> {
pub genes: Vec<&'a str>,
pub exons: Vec<&'a str>,
}
pub fn multi_query<'a>(
interval: &GenomicInterval,
genes: &'a AnnotationIndex,
exons: &'a AnnotationIndex,
) -> MultiQueryResult<'a> {
let gene_results = genes.query(interval);
let exon_results = exons.query(interval);
MultiQueryResult {
genes: gene_results,
exons: exon_results,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_annotation_index() {
let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
annotations
.entry("chr1".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr1", 100, 200),
data: "GeneA".to_string(),
});
annotations
.entry("chr1".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr1", 150, 250),
data: "GeneB".to_string(),
});
annotations
.entry("chr2".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr2", 100, 200),
data: "GeneC".to_string(),
});
let mut index = AnnotationIndex::new();
index.build_trees(annotations);
let query = GenomicInterval::new("chr1", 175, 225);
let results = index.query(&query);
assert_eq!(results.len(), 2);
assert!(results.contains(&"GeneA"));
assert!(results.contains(&"GeneB"));
let query = GenomicInterval::new("chr1", 300, 400);
let results = index.query(&query);
assert_eq!(results.len(), 0);
let query = GenomicInterval::new("chr2", 150, 250);
let results = index.query(&query);
assert_eq!(results.len(), 1);
assert!(results.contains(&"GeneC"));
}
#[test]
fn test_annotation_index_stats() {
let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
annotations
.entry("chr1".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr1", 100, 200),
data: "GeneA".to_string(),
});
annotations
.entry("chr2".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr2", 100, 200),
data: "GeneB".to_string(),
});
let mut index = AnnotationIndex::new();
index.build_trees(annotations);
assert!(!index.is_empty());
assert_eq!(index.len(), 2);
let chroms = index.chromosomes();
assert_eq!(chroms.len(), 2);
assert!(chroms.contains(&"chr1"));
assert!(chroms.contains(&"chr2"));
}
#[test]
fn test_multi_query() {
let mut gene_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
let mut exon_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
gene_annotations
.entry("chr1".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr1", 100, 500),
data: "GeneA".to_string(),
});
exon_annotations
.entry("chr1".to_string())
.or_insert_with(Vec::new)
.push(Annotation {
interval: GenomicInterval::new("chr1", 150, 200),
data: "ExonA1".to_string(),
});
let mut genes = AnnotationIndex::new();
genes.build_trees(gene_annotations);
let mut exons = AnnotationIndex::new();
exons.build_trees(exon_annotations);
let query = GenomicInterval::new("chr1", 175, 225);
let results = multi_query(&query, &genes, &exons);
assert_eq!(results.genes.len(), 1);
assert_eq!(results.exons.len(), 1);
assert_eq!(results.genes[0], "GeneA");
assert_eq!(results.exons[0], "ExonA1");
}
}