Skip to main content

genomicframe_core/interval/
annotation.rs

1//! Genomic interval annotation
2//!
3//! This module provides infrastructure for annotating genomic intervals
4//! with contextual information from reference databases (genes, exons, etc.).
5//!
6//! # Design Philosophy
7//!
8//! - **Lazy by default**: Annotations are computed on-demand during iteration
9//! - **Format-agnostic**: Works with any format that converts to GenomicInterval
10//! - **Composable**: Multiple annotation sources can be chained
11//! - **Fast queries**: O(log n + k) overlap queries using interval trees
12//!
13//! # Examples
14//!
15//! ```no_run
16//! use genomicframe_core::interval::annotation::AnnotationIndex;
17//! use genomicframe_core::formats::bed::BedReader;
18//!
19//! // Build annotation index from BED file
20//! let exons = AnnotationIndex::from_bed("exons.bed", |record| {
21//!     record.name.clone().unwrap_or_else(|| "unknown".to_string())
22//! })?;
23//!
24//! // Query for overlapping annotations
25//! let interval = GenomicInterval::new("chr1", 1000, 2000);
26//! let genes = exons.query(&interval);
27//! # use genomicframe_core::interval::GenomicInterval;
28//! # Ok::<(), genomicframe_core::error::Error>(())
29//! ```
30
31use crate::core::GenomicRecordIterator;
32use crate::error::Result;
33use crate::formats::bed::{BedReader, BedRecord};
34use crate::interval::GenomicInterval;
35use std::collections::HashMap;
36use std::path::Path;
37
38/// Annotation metadata attached to an interval
39#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct Annotation {
41    /// The genomic interval
42    pub interval: GenomicInterval,
43    /// Free-form annotation data (gene name, feature type, etc.)
44    pub data: String,
45}
46
47/// Interval tree node for O(log n + k) overlap queries
48#[derive(Debug, Clone)]
49pub struct IntervalTreeNode {
50    center: u64,
51    left_sorted: Vec<Annotation>,  // Sorted by start position
52    right_sorted: Vec<Annotation>, // Sorted by end position
53    left: Option<Box<IntervalTreeNode>>,
54    right: Option<Box<IntervalTreeNode>>,
55}
56
57/// A standalone interval tree for efficient overlap queries
58///
59/// This provides direct access to the tree structure for advanced use cases.
60#[derive(Debug, Clone)]
61pub struct IntervalTree {
62    root: Option<IntervalTreeNode>,
63    count: usize,
64}
65
66impl IntervalTree {
67    /// Create an empty interval tree
68    pub fn new() -> Self {
69        Self {
70            root: None,
71            count: 0,
72        }
73    }
74
75    /// Build interval tree from a collection of annotations
76    pub fn from_annotations(annotations: Vec<Annotation>) -> Self {
77        let count = annotations.len();
78        let root = IntervalTreeNode::from_annotations(annotations);
79        Self { root, count }
80    }
81
82    /// Query for overlapping annotations
83    pub fn query<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a str> {
84        let mut results = Vec::new();
85        if let Some(ref root) = self.root {
86            root.query(interval, &mut results);
87        }
88        results
89    }
90
91    /// Query for overlapping annotations (full structs)
92    pub fn query_annotations<'a>(&'a self, interval: &GenomicInterval) -> Vec<&'a Annotation> {
93        let mut results = Vec::new();
94        if let Some(ref root) = self.root {
95            root.query_annotations(interval, &mut results);
96        }
97        results
98    }
99
100    /// Count of annotations in the tree
101    pub fn len(&self) -> usize {
102        self.count
103    }
104
105    /// Check if tree is empty
106    pub fn is_empty(&self) -> bool {
107        self.count == 0
108    }
109}
110
111impl Default for IntervalTree {
112    fn default() -> Self {
113        Self::new()
114    }
115}
116
117impl IntervalTreeNode {
118    /// Build interval tree from a list of annotations
119    fn from_annotations(annotations: Vec<Annotation>) -> Option<Self> {
120        if annotations.is_empty() {
121            return None;
122        }
123
124        // Base case: for small sets, stop recursing and store directly
125        // This prevents stack overflow and improves performance for leaf nodes
126        // With a threshold of 64, max tree depth is ~log2(N/64), so for 32K items
127        // that's ~9 levels, well within stack limits
128        if annotations.len() <= 64 {
129            let mut left_sorted = annotations.clone();
130            let mut right_sorted = annotations;
131            left_sorted.sort_by_key(|a| a.interval.start);
132            right_sorted.sort_by_key(|a| a.interval.end);
133
134            let center = left_sorted[left_sorted.len() / 2].interval.start;
135
136            return Some(IntervalTreeNode {
137                center,
138                left_sorted,
139                right_sorted,
140                left: None,
141                right: None,
142            });
143        }
144
145        // Find center point (median of all start/end coordinates)
146        let mut coords: Vec<u64> = annotations
147            .iter()
148            .flat_map(|a| vec![a.interval.start, a.interval.end])
149            .collect();
150        coords.sort_unstable();
151        let center = coords[coords.len() / 2];
152
153        // Partition intervals
154        let mut left_intervals = Vec::new();
155        let mut right_intervals = Vec::new();
156        let mut center_intervals = Vec::new();
157
158        for ann in annotations {
159            if ann.interval.end <= center {
160                left_intervals.push(ann);
161            } else if ann.interval.start > center {
162                right_intervals.push(ann);
163            } else {
164                // Overlaps center
165                center_intervals.push(ann);
166            }
167        }
168
169        // Sort center intervals for efficient stabbing queries
170        let mut left_sorted = center_intervals.clone();
171        let mut right_sorted = center_intervals;
172
173        left_sorted.sort_by_key(|a| a.interval.start);
174        right_sorted.sort_by_key(|a| a.interval.end);
175
176        Some(IntervalTreeNode {
177            center,
178            left_sorted,
179            right_sorted,
180            left: IntervalTreeNode::from_annotations(left_intervals).map(Box::new),
181            right: IntervalTreeNode::from_annotations(right_intervals).map(Box::new),
182        })
183    }
184
185    /// Query for overlapping intervals
186    fn query<'a>(&'a self, interval: &GenomicInterval, results: &mut Vec<&'a str>) {
187        // Check intervals that span the center
188        if interval.end <= self.center {
189            // Query left of center
190            for ann in &self.left_sorted {
191                if ann.interval.start >= interval.end {
192                    break; // No more overlaps possible
193                }
194                if ann.interval.overlaps(interval) {
195                    results.push(&ann.data);
196                }
197            }
198        } else if interval.start > self.center {
199            // Query right of center
200            for ann in self.right_sorted.iter().rev() {
201                if ann.interval.end <= interval.start {
202                    break; // No more overlaps possible
203                }
204                if ann.interval.overlaps(interval) {
205                    results.push(&ann.data);
206                }
207            }
208        } else {
209            // Query spans center - check all center intervals
210            for ann in &self.left_sorted {
211                if ann.interval.overlaps(interval) {
212                    results.push(&ann.data);
213                }
214            }
215        }
216
217        // Recurse into subtrees
218        if interval.start < self.center {
219            if let Some(ref left) = self.left {
220                left.query(interval, results);
221            }
222        }
223        if interval.end > self.center {
224            if let Some(ref right) = self.right {
225                right.query(interval, results);
226            }
227        }
228    }
229
230    /// Query for overlapping annotations (full structs)
231    fn query_annotations<'a>(
232        &'a self,
233        interval: &GenomicInterval,
234        results: &mut Vec<&'a Annotation>,
235    ) {
236        // Check intervals that span the center
237        if interval.end <= self.center {
238            for ann in &self.left_sorted {
239                if ann.interval.start >= interval.end {
240                    break;
241                }
242                if ann.interval.overlaps(interval) {
243                    results.push(ann);
244                }
245            }
246        } else if interval.start > self.center {
247            for ann in self.right_sorted.iter().rev() {
248                if ann.interval.end <= interval.start {
249                    break;
250                }
251                if ann.interval.overlaps(interval) {
252                    results.push(ann);
253                }
254            }
255        } else {
256            for ann in &self.left_sorted {
257                if ann.interval.overlaps(interval) {
258                    results.push(ann);
259                }
260            }
261        }
262
263        if interval.start < self.center {
264            if let Some(ref left) = self.left {
265                left.query_annotations(interval, results);
266            }
267        }
268        if interval.end > self.center {
269            if let Some(ref right) = self.right {
270                right.query_annotations(interval, results);
271            }
272        }
273    }
274}
275
276/// Fast interval index using interval trees for O(log n + k) overlap queries
277///
278/// This implementation uses a centered interval tree per chromosome,
279/// providing logarithmic query time instead of linear scanning.
280#[derive(Debug, Clone)]
281pub struct AnnotationIndex {
282    /// Interval trees grouped by chromosome
283    trees: HashMap<String, IntervalTreeNode>,
284    /// Total annotation count (cached for efficiency)
285    count: usize,
286}
287
288impl AnnotationIndex {
289    /// Create an empty annotation index
290    pub fn new() -> Self {
291        Self {
292            trees: HashMap::new(),
293            count: 0,
294        }
295    }
296
297    /// Build interval trees from collected annotations
298    ///
299    /// This should be called after all insertions are complete.
300    /// For efficiency, batch all insertions then call this once.
301    pub fn build_trees(&mut self, annotations: HashMap<String, Vec<Annotation>>) {
302        self.count = annotations.values().map(|v| v.len()).sum();
303
304        for (chrom, ann_list) in annotations {
305            if let Some(tree) = IntervalTreeNode::from_annotations(ann_list) {
306                self.trees.insert(chrom, tree);
307            }
308        }
309    }
310
311    /// Build annotation index from BED file
312    ///
313    /// The `extractor` function converts each BED record into annotation data.
314    /// Typically this extracts the "name" field, but can be customized.
315    ///
316    /// # Examples
317    ///
318    /// ```no_run
319    /// use genomicframe_core::interval::annotation::AnnotationIndex;
320    ///
321    /// // Extract gene names from BED name field
322    /// let index = AnnotationIndex::from_bed("genes.bed", |record| {
323    ///     record.name.clone().unwrap_or_default()
324    /// })?;
325    ///
326    /// // Custom extraction: combine name and score
327    /// let index = AnnotationIndex::from_bed("features.bed", |record| {
328    ///     format!("{}:{}",
329    ///         record.name.as_deref().unwrap_or("unknown"),
330    ///         record.score.unwrap_or(0))
331    /// })?;
332    /// # Ok::<(), genomicframe_core::error::Error>(())
333    /// ```
334    pub fn from_bed<P, F>(path: P, extractor: F) -> Result<Self>
335    where
336        P: AsRef<Path>,
337        F: Fn(&BedRecord) -> String,
338    {
339        let mut reader = BedReader::from_path(path)?;
340        let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
341
342        while let Some(record) = reader.next_record()? {
343            let interval: GenomicInterval = (&record).into();
344            let data = extractor(&record);
345
346            annotations
347                .entry(interval.chrom.clone())
348                .or_insert_with(Vec::new)
349                .push(Annotation { interval, data });
350        }
351
352        let mut index = Self::new();
353        index.build_trees(annotations);
354        Ok(index)
355    }
356
357    /// Build annotation index from an iterator of BED records
358    pub fn from_records<'a, I, F>(records: I, extractor: F) -> Self
359    where
360        I: IntoIterator<Item = &'a BedRecord>,
361        F: Fn(&BedRecord) -> String,
362    {
363        let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
364
365        for record in records {
366            let interval: GenomicInterval = record.into();
367            let data = extractor(record);
368
369            annotations
370                .entry(interval.chrom.clone())
371                .or_insert_with(Vec::new)
372                .push(Annotation { interval, data });
373        }
374
375        let mut index = Self::new();
376        index.build_trees(annotations);
377        index
378    }
379
380    /// Build annotation index from a reader (possibly filtered)
381    pub fn from_reader<I, F>(mut reader: I, extractor: F) -> Result<Self>
382    where
383        I: crate::core::GenomicRecordIterator<Record = BedRecord>,
384        F: Fn(&BedRecord) -> String,
385    {
386        let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
387
388        while let Some(record) = reader.next_record()? {
389            let interval: GenomicInterval = (&record).into();
390            let data = extractor(&record);
391
392            annotations
393                .entry(interval.chrom.clone())
394                .or_insert_with(Vec::new)
395                .push(Annotation { interval, data });
396        }
397
398        let mut index = Self::new();
399        index.build_trees(annotations);
400        Ok(index)
401    }
402
403    /// Query for all annotations overlapping the given interval
404    ///
405    /// Returns references to annotation data strings.
406    ///
407    /// # Performance
408    /// O(log n + k) where n = number of annotations on the chromosome,
409    /// k = number of overlapping annotations
410    pub fn query(&self, interval: &GenomicInterval) -> Vec<&str> {
411        let mut results = Vec::new();
412
413        // Try exact chrom match first
414        if let Some(tree) = self.trees.get(&interval.chrom) {
415            tree.query(interval, &mut results);
416            return results;
417        }
418
419        // Fallback: try alternate chrom name (add/remove leading "chr")
420        let alt_chrom = if interval.chrom.starts_with("chr") {
421            interval.chrom.trim_start_matches("chr").to_string()
422        } else {
423            format!("chr{}", interval.chrom)
424        };
425
426        if let Some(tree) = self.trees.get(&alt_chrom) {
427            // Create a normalized query interval for overlap checking
428            let normalized_query = GenomicInterval {
429                chrom: alt_chrom,
430                start: interval.start,
431                end: interval.end,
432            };
433
434            tree.query(&normalized_query, &mut results);
435        }
436
437        results
438    }
439
440    /// Query for all annotations overlapping the given interval
441    ///
442    /// Returns full Annotation structs (includes intervals).
443    pub fn query_annotations(&self, interval: &GenomicInterval) -> Vec<&Annotation> {
444        let mut results = Vec::new();
445
446        // Try exact chrom match first
447        if let Some(tree) = self.trees.get(&interval.chrom) {
448            tree.query_annotations(interval, &mut results);
449            return results;
450        }
451
452        // Fallback: try alternate chrom name
453        let alt_chrom = if interval.chrom.starts_with("chr") {
454            interval.chrom.trim_start_matches("chr").to_string()
455        } else {
456            format!("chr{}", interval.chrom)
457        };
458
459        if let Some(tree) = self.trees.get(&alt_chrom) {
460            let normalized_query = GenomicInterval {
461                chrom: alt_chrom,
462                start: interval.start,
463                end: interval.end,
464            };
465
466            tree.query_annotations(&normalized_query, &mut results);
467        }
468
469        results
470    }
471
472    /// Count total annotations in the index
473    pub fn len(&self) -> usize {
474        self.count
475    }
476
477    /// Check if the index is empty
478    pub fn is_empty(&self) -> bool {
479        self.count == 0
480    }
481
482    /// Get chromosomes present in the index
483    pub fn chromosomes(&self) -> Vec<&str> {
484        self.trees.keys().map(|s| s.as_str()).collect()
485    }
486}
487
488impl Default for AnnotationIndex {
489    fn default() -> Self {
490        Self::new()
491    }
492}
493
494/// Multi-index query result for efficient batch queries
495#[derive(Debug, Clone)]
496pub struct MultiQueryResult<'a> {
497    pub genes: Vec<&'a str>,
498    pub exons: Vec<&'a str>,
499}
500
501/// Query multiple annotation indexes simultaneously for better cache locality
502///
503/// This is more efficient than calling query() on each index separately
504/// when you need results from multiple sources.
505pub fn multi_query<'a>(
506    interval: &GenomicInterval,
507    genes: &'a AnnotationIndex,
508    exons: &'a AnnotationIndex,
509) -> MultiQueryResult<'a> {
510    // Query both indexes - compiler can potentially optimize this
511    let gene_results = genes.query(interval);
512    let exon_results = exons.query(interval);
513
514    MultiQueryResult {
515        genes: gene_results,
516        exons: exon_results,
517    }
518}
519
520#[cfg(test)]
521mod tests {
522    use super::*;
523
524    #[test]
525    fn test_annotation_index() {
526        let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
527
528        annotations
529            .entry("chr1".to_string())
530            .or_insert_with(Vec::new)
531            .push(Annotation {
532                interval: GenomicInterval::new("chr1", 100, 200),
533                data: "GeneA".to_string(),
534            });
535
536        annotations
537            .entry("chr1".to_string())
538            .or_insert_with(Vec::new)
539            .push(Annotation {
540                interval: GenomicInterval::new("chr1", 150, 250),
541                data: "GeneB".to_string(),
542            });
543
544        annotations
545            .entry("chr2".to_string())
546            .or_insert_with(Vec::new)
547            .push(Annotation {
548                interval: GenomicInterval::new("chr2", 100, 200),
549                data: "GeneC".to_string(),
550            });
551
552        let mut index = AnnotationIndex::new();
553        index.build_trees(annotations);
554
555        // Query overlapping interval
556        let query = GenomicInterval::new("chr1", 175, 225);
557        let results = index.query(&query);
558
559        assert_eq!(results.len(), 2);
560        assert!(results.contains(&"GeneA"));
561        assert!(results.contains(&"GeneB"));
562
563        // Query non-overlapping interval
564        let query = GenomicInterval::new("chr1", 300, 400);
565        let results = index.query(&query);
566        assert_eq!(results.len(), 0);
567
568        // Query different chromosome
569        let query = GenomicInterval::new("chr2", 150, 250);
570        let results = index.query(&query);
571        assert_eq!(results.len(), 1);
572        assert!(results.contains(&"GeneC"));
573    }
574
575    #[test]
576    fn test_annotation_index_stats() {
577        let mut annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
578
579        annotations
580            .entry("chr1".to_string())
581            .or_insert_with(Vec::new)
582            .push(Annotation {
583                interval: GenomicInterval::new("chr1", 100, 200),
584                data: "GeneA".to_string(),
585            });
586
587        annotations
588            .entry("chr2".to_string())
589            .or_insert_with(Vec::new)
590            .push(Annotation {
591                interval: GenomicInterval::new("chr2", 100, 200),
592                data: "GeneB".to_string(),
593            });
594
595        let mut index = AnnotationIndex::new();
596        index.build_trees(annotations);
597
598        assert!(!index.is_empty());
599        assert_eq!(index.len(), 2);
600
601        let chroms = index.chromosomes();
602        assert_eq!(chroms.len(), 2);
603        assert!(chroms.contains(&"chr1"));
604        assert!(chroms.contains(&"chr2"));
605    }
606
607    #[test]
608    fn test_multi_query() {
609        let mut gene_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
610        let mut exon_annotations: HashMap<String, Vec<Annotation>> = HashMap::new();
611
612        gene_annotations
613            .entry("chr1".to_string())
614            .or_insert_with(Vec::new)
615            .push(Annotation {
616                interval: GenomicInterval::new("chr1", 100, 500),
617                data: "GeneA".to_string(),
618            });
619
620        exon_annotations
621            .entry("chr1".to_string())
622            .or_insert_with(Vec::new)
623            .push(Annotation {
624                interval: GenomicInterval::new("chr1", 150, 200),
625                data: "ExonA1".to_string(),
626            });
627
628        let mut genes = AnnotationIndex::new();
629        genes.build_trees(gene_annotations);
630
631        let mut exons = AnnotationIndex::new();
632        exons.build_trees(exon_annotations);
633
634        // Query both simultaneously
635        let query = GenomicInterval::new("chr1", 175, 225);
636        let results = multi_query(&query, &genes, &exons);
637
638        assert_eq!(results.genes.len(), 1);
639        assert_eq!(results.exons.len(), 1);
640        assert_eq!(results.genes[0], "GeneA");
641        assert_eq!(results.exons[0], "ExonA1");
642    }
643}