genomicframe_core/interval/
annotation.rs1use crate::core::GenomicRecordIterator;
32use crate::error::Result;
33use crate::formats::bed::{BedReader, BedRecord};
34use crate::interval::GenomicInterval;
35use std::collections::HashMap;
36use std::path::Path;
37
38#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct Annotation {
41 pub interval: GenomicInterval,
43 pub data: String,
45}
46
47#[derive(Debug, Clone)]
49pub struct IntervalTreeNode {
50 center: u64,
51 left_sorted: Vec<Annotation>, right_sorted: Vec<Annotation>, left: Option<Box<IntervalTreeNode>>,
54 right: Option<Box<IntervalTreeNode>>,
55}
56
57#[derive(Debug, Clone)]
61pub struct IntervalTree {
62 root: Option<IntervalTreeNode>,
63 count: usize,
64}
65
66impl IntervalTree {
67 pub fn new() -> Self {
69 Self {
70 root: None,
71 count: 0,
72 }
73 }
74
75 pub fn from_annotations(annotations: Vec<Annotation>) -> Self {
77 let count = annotations.len();
78 let root = IntervalTreeNode::from_annotations(annotations);
79 Self { root, count }
80 }
81
82 pub fn query<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a str> {
84 let mut results = Vec::new();
85 if let Some(ref root) = self.root {
86 root.query(interval, &mut results);
87 }
88 results
89 }
90
91 pub fn query_annotations<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a Annotation> {
93 let mut results = Vec::new();
94 if let Some(ref root) = self.root {
95 root.query_annotations(interval, &mut results);
96 }
97 results
98 }
99
100 pub fn len(&self) -> usize {
102 self.count
103 }
104
105 pub fn is_empty(&self) -> bool {
107 self.count == 0
108 }
109}
110
111impl Default for IntervalTree {
112 fn default() -> Self {
113 Self::new()
114 }
115}
116
117impl IntervalTreeNode {
118 fn from_annotations(annotations: Vec<Annotation>) -> Option<Self> {
120 if annotations.is_empty() {
121 return None;
122 }
123
124 if annotations.len() <= 64 {
129 let mut left_sorted = annotations.clone();
130 let mut right_sorted = annotations;
131 left_sorted.sort_by_key(|a| a.interval.start);
132 right_sorted.sort_by_key(|a| a.interval.end);
133
134 let center = left_sorted[left_sorted.len() / 2].interval.start;
135
136 return Some(IntervalTreeNode {
137 center,
138 left_sorted,
139 right_sorted,
140 left: None,
141 right: None,
142 });
143 }
144
145 let mut coords: Vec<u64> = annotations
147 .iter()
148 .flat_map(|a| vec![a.interval.start, a.interval.end])
149 .collect();
150 coords.sort_unstable();
151 let center = coords[coords.len() / 2];
152
153 let mut left_intervals = Vec::new();
155 let mut right_intervals = Vec::new();
156 let mut center_intervals = Vec::new();
157
158 for ann in annotations {
159 if ann.interval.end <= center {
160 left_intervals.push(ann);
161 } else if ann.interval.start > center {
162 right_intervals.push(ann);
163 } else {
164 center_intervals.push(ann);
166 }
167 }
168
169 let mut left_sorted = center_intervals.clone();
171 let mut right_sorted = center_intervals;
172
173 left_sorted.sort_by_key(|a| a.interval.start);
174 right_sorted.sort_by_key(|a| a.interval.end);
175
176 Some(IntervalTreeNode {
177 center,
178 left_sorted,
179 right_sorted,
180 left: IntervalTreeNode::from_annotations(left_intervals).map(Box::new),
181 right: IntervalTreeNode::from_annotations(right_intervals).map(Box::new),
182 })
183 }
184
185 fn query<'a>(&'a self, interval: &GenomicInterval, results: &mut Vec<&'a str>) {
187 if interval.end <= self.center {
189 for ann in &self.left_sorted {
191 if ann.interval.start >= interval.end {
192 break; }
194 if ann.interval.overlaps(interval) {
195 results.push(&ann.data);
196 }
197 }
198 } else if interval.start > self.center {
199 for ann in self.right_sorted.iter().rev() {
201 if ann.interval.end <= interval.start {
202 break; }
204 if ann.interval.overlaps(interval) {
205 results.push(&ann.data);
206 }
207 }
208 } else {
209 for ann in &self.left_sorted {
211 if ann.interval.overlaps(interval) {
212 results.push(&ann.data);
213 }
214 }
215 }
216
217 if interval.start < self.center {
219 if let Some(ref left) = self.left {
220 left.query(interval, results);
221 }
222 }
223 if interval.end > self.center {
224 if let Some(ref right) = self.right {
225 right.query(interval, results);
226 }
227 }
228 }
229
230 fn query_annotations<'a>(
232 &'a self,
233 interval: &GenomicInterval,
234 results: &mut Vec<&'a Annotation>,
235 ) {
236 if interval.end <= self.center {
238 for ann in &self.left_sorted {
239 if ann.interval.start >= interval.end {
240 break;
241 }
242 if ann.interval.overlaps(interval) {
243 results.push(ann);
244 }
245 }
246 } else if interval.start > self.center {
247 for ann in self.right_sorted.iter().rev() {
248 if ann.interval.end <= interval.start {
249 break;
250 }
251 if ann.interval.overlaps(interval) {
252 results.push(ann);
253 }
254 }
255 } else {
256 for ann in &self.left_sorted {
257 if ann.interval.overlaps(interval) {
258 results.push(ann);
259 }
260 }
261 }
262
263 if interval.start < self.center {
264 if let Some(ref left) = self.left {
265 left.query_annotations(interval, results);
266 }
267 }
268 if interval.end > self.center {
269 if let Some(ref right) = self.right {
270 right.query_annotations(interval, results);
271 }
272 }
273 }
274}
275
276#[derive(Debug, Clone)]
281pub struct AnnotationIndex {
282 trees: HashMap<String, IntervalTreeNode>,
284 count: usize,
286}
287
288impl AnnotationIndex {
289 pub fn new() -> Self {
291 Self {
292 trees: HashMap::new(),
293 count: 0,
294 }
295 }
296
297 pub fn build_trees(&mut self, annotations: HashMap<String, Vec<Annotation>>) {
302 self.count = annotations.values().map(|v| v.len()).sum();
303
304 for (chrom, ann_list) in annotations {
305 if let Some(tree) = IntervalTreeNode::from_annotations(ann_list) {
306 self.trees.insert(chrom, tree);
307 }
308 }
309 }
310
311 pub fn from_bed<P, F>(path: P, extractor: F) -> Result<Self>
335 where
336 P: AsRef<Path>,
337 F: Fn(&BedRecord) -> String,
338 {
339 let mut reader = BedReader::from_path(path)?;
340 let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
341
342 while let Some(record) = reader.next_record()? {
343 let interval: GenomicInterval = (&record).into();
344 let data = extractor(&record);
345
346 annotations
347 .entry(interval.chrom.clone())
348 .or_insert_with(Vec::new)
349 .push(Annotation { interval, data });
350 }
351
352 let mut index = Self::new();
353 index.build_trees(annotations);
354 Ok(index)
355 }
356
357 pub fn from_records<'a, I, F>(records: I, extractor: F) -> Self
359 where
360 I: IntoIterator<Item = &'a BedRecord>,
361 F: Fn(&BedRecord) -> String,
362 {
363 let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
364
365 for record in records {
366 let interval: GenomicInterval = record.into();
367 let data = extractor(record);
368
369 annotations
370 .entry(interval.chrom.clone())
371 .or_insert_with(Vec::new)
372 .push(Annotation { interval, data });
373 }
374
375 let mut index = Self::new();
376 index.build_trees(annotations);
377 index
378 }
379
380 pub fn from_reader<I, F>(mut reader: I, extractor: F) -> Result<Self>
382 where
383 I: crate::core::GenomicRecordIterator<Record = BedRecord>,
384 F: Fn(&BedRecord) -> String,
385 {
386 let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
387
388 while let Some(record) = reader.next_record()? {
389 let interval: GenomicInterval = (&record).into();
390 let data = extractor(&record);
391
392 annotations
393 .entry(interval.chrom.clone())
394 .or_insert_with(Vec::new)
395 .push(Annotation { interval, data });
396 }
397
398 let mut index = Self::new();
399 index.build_trees(annotations);
400 Ok(index)
401 }
402
403 pub fn query(&self, interval: &GenomicInterval) -> Vec<&str> {
411 let mut results = Vec::new();
412
413 if let Some(tree) = self.trees.get(&interval.chrom) {
415 tree.query(interval, &mut results);
416 return results;
417 }
418
419 let alt_chrom = if interval.chrom.starts_with("chr") {
421 interval.chrom.trim_start_matches("chr").to_string()
422 } else {
423 format!("chr{}", interval.chrom)
424 };
425
426 if let Some(tree) = self.trees.get(&alt_chrom) {
427 let normalized_query = GenomicInterval {
429 chrom: alt_chrom,
430 start: interval.start,
431 end: interval.end,
432 };
433
434 tree.query(&normalized_query, &mut results);
435 }
436
437 results
438 }
439
440 pub fn query_annotations(&self, interval: &GenomicInterval) -> Vec<&Annotation> {
444 let mut results = Vec::new();
445
446 if let Some(tree) = self.trees.get(&interval.chrom) {
448 tree.query_annotations(interval, &mut results);
449 return results;
450 }
451
452 let alt_chrom = if interval.chrom.starts_with("chr") {
454 interval.chrom.trim_start_matches("chr").to_string()
455 } else {
456 format!("chr{}", interval.chrom)
457 };
458
459 if let Some(tree) = self.trees.get(&alt_chrom) {
460 let normalized_query = GenomicInterval {
461 chrom: alt_chrom,
462 start: interval.start,
463 end: interval.end,
464 };
465
466 tree.query_annotations(&normalized_query, &mut results);
467 }
468
469 results
470 }
471
472 pub fn len(&self) -> usize {
474 self.count
475 }
476
477 pub fn is_empty(&self) -> bool {
479 self.count == 0
480 }
481
482 pub fn chromosomes(&self) -> Vec<&str> {
484 self.trees.keys().map(|s| s.as_str()).collect()
485 }
486}
487
488impl Default for AnnotationIndex {
489 fn default() -> Self {
490 Self::new()
491 }
492}
493
494#[derive(Debug, Clone)]
496pub struct MultiQueryResult<'a> {
497 pub genes: Vec<&'a str>,
498 pub exons: Vec<&'a str>,
499}
500
501pub fn multi_query<'a>(
506 interval: &GenomicInterval,
507 genes: &'a AnnotationIndex,
508 exons: &'a AnnotationIndex,
509) -> MultiQueryResult<'a> {
510 let gene_results = genes.query(interval);
512 let exon_results = exons.query(interval);
513
514 MultiQueryResult {
515 genes: gene_results,
516 exons: exon_results,
517 }
518}
519
520#[cfg(test)]
521mod tests {
522 use super::*;
523
524 #[test]
525 fn test_annotation_index() {
526 let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
527
528 annotations
529 .entry("chr1".to_string())
530 .or_insert_with(Vec::new)
531 .push(Annotation {
532 interval: GenomicInterval::new("chr1", 100, 200),
533 data: "GeneA".to_string(),
534 });
535
536 annotations
537 .entry("chr1".to_string())
538 .or_insert_with(Vec::new)
539 .push(Annotation {
540 interval: GenomicInterval::new("chr1", 150, 250),
541 data: "GeneB".to_string(),
542 });
543
544 annotations
545 .entry("chr2".to_string())
546 .or_insert_with(Vec::new)
547 .push(Annotation {
548 interval: GenomicInterval::new("chr2", 100, 200),
549 data: "GeneC".to_string(),
550 });
551
552 let mut index = AnnotationIndex::new();
553 index.build_trees(annotations);
554
555 let query = GenomicInterval::new("chr1", 175, 225);
557 let results = index.query(&query);
558
559 assert_eq!(results.len(), 2);
560 assert!(results.contains(&"GeneA"));
561 assert!(results.contains(&"GeneB"));
562
563 let query = GenomicInterval::new("chr1", 300, 400);
565 let results = index.query(&query);
566 assert_eq!(results.len(), 0);
567
568 let query = GenomicInterval::new("chr2", 150, 250);
570 let results = index.query(&query);
571 assert_eq!(results.len(), 1);
572 assert!(results.contains(&"GeneC"));
573 }
574
575 #[test]
576 fn test_annotation_index_stats() {
577 let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
578
579 annotations
580 .entry("chr1".to_string())
581 .or_insert_with(Vec::new)
582 .push(Annotation {
583 interval: GenomicInterval::new("chr1", 100, 200),
584 data: "GeneA".to_string(),
585 });
586
587 annotations
588 .entry("chr2".to_string())
589 .or_insert_with(Vec::new)
590 .push(Annotation {
591 interval: GenomicInterval::new("chr2", 100, 200),
592 data: "GeneB".to_string(),
593 });
594
595 let mut index = AnnotationIndex::new();
596 index.build_trees(annotations);
597
598 assert!(!index.is_empty());
599 assert_eq!(index.len(), 2);
600
601 let chroms = index.chromosomes();
602 assert_eq!(chroms.len(), 2);
603 assert!(chroms.contains(&"chr1"));
604 assert!(chroms.contains(&"chr2"));
605 }
606
607 #[test]
608 fn test_multi_query() {
609 let mut gene_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
610 let mut exon_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
611
612 gene_annotations
613 .entry("chr1".to_string())
614 .or_insert_with(Vec::new)
615 .push(Annotation {
616 interval: GenomicInterval::new("chr1", 100, 500),
617 data: "GeneA".to_string(),
618 });
619
620 exon_annotations
621 .entry("chr1".to_string())
622 .or_insert_with(Vec::new)
623 .push(Annotation {
624 interval: GenomicInterval::new("chr1", 150, 200),
625 data: "ExonA1".to_string(),
626 });
627
628 let mut genes = AnnotationIndex::new();
629 genes.build_trees(gene_annotations);
630
631 let mut exons = AnnotationIndex::new();
632 exons.build_trees(exon_annotations);
633
634 let query = GenomicInterval::new("chr1", 175, 225);
636 let results = multi_query(&query, &genes, &exons);
637
638 assert_eq!(results.genes.len(), 1);
639 assert_eq!(results.exons.len(), 1);
640 assert_eq!(results.genes[0], "GeneA");
641 assert_eq!(results.exons[0], "ExonA1");
642 }
643}