Skip to main content

genomicframe_core/
interval.rs

1//! Genomic interval operations
2//!
3//! Core data structures and operations for working with genomic intervals.
4//! Used across all formats for region-based queries, overlap detection, and filtering.
5//!
6//! # Design Philosophy
7//!
8//! - **0-based coordinates**: All intervals use 0-based, half-open [start, end) coordinates
9//!   - This matches BED, BAM, and internal representations
10//!   - VCF uses 1-based coordinates but is converted during parsing
11//! - **Efficient operations**: Overlap and containment checks are O(1)
12//! - **Immutable by default**: Intervals are cheap to clone (just 3 fields)
13//! - **Cross-format compatibility**: Can convert from any genomic record type
14//!
15//! # Examples
16//!
17//! ```
18//! use genomicframe_core::interval::GenomicInterval;
19//!
20//! // Create intervals
21//! let interval1 = GenomicInterval::new("chr1", 1000, 2000);
22//! let interval2 = GenomicInterval::new("chr1", 1500, 2500);
23//!
24//! // Check overlap
25//! assert!(interval1.overlaps(&interval2));
26//!
27//! // Compute overlap length
28//! assert_eq!(interval1.overlap_length(&interval2), 500);
29//!
30//! // Get intersection
31//! let intersection = interval1.intersect(&interval2).unwrap();
32//! assert_eq!(intersection.start, 1500);
33//! assert_eq!(intersection.end, 2000);
34//! ```
35
36use crate::error::{Error, Result};
37use std::cmp::{max, min};
38use std::fmt;
39
40// Conversions from format-specific records
41pub mod conversions;
42
43// Annotation infrastructure
44pub mod annotation;
45
46/// A genomic interval representing a region on a chromosome
47///
48/// Uses 0-based, half-open [start, end) coordinates:
49/// - `start` is inclusive (0-based)
50/// - `end` is exclusive (0-based)
51///
52/// This matches BED format and is the standard for genomic intervals.
53#[derive(Debug, Clone, PartialEq, Eq, Hash)]
54pub struct GenomicInterval {
55    /// Chromosome/contig name (e.g., "chr1", "chrX", "1")
56    pub chrom: String,
57
58    /// Start position (0-based, inclusive)
59    pub start: u64,
60
61    /// End position (0-based, exclusive)
62    pub end: u64,
63}
64
65impl GenomicInterval {
66    /// Create a new genomic interval
67    ///
68    /// # Arguments
69    /// * `chrom` - Chromosome name
70    /// * `start` - Start position (0-based, inclusive)
71    /// * `end` - End position (0-based, exclusive)
72    ///
73    /// # Panics
74    /// Panics if `start > end`
75    ///
76    /// # Example
77    /// ```
78    /// use genomicframe_core::interval::GenomicInterval;
79    ///
80    /// let interval = GenomicInterval::new("chr1", 1000, 2000);
81    /// assert_eq!(interval.len(), 1000);
82    /// ```
83    pub fn new<S: Into<String>>(chrom: S, start: u64, end: u64) -> Self {
84        let interval = Self {
85            chrom: chrom.into(),
86            start,
87            end,
88        };
89        assert!(
90            start <= end,
91            "Invalid interval: start ({}) > end ({})",
92            start,
93            end
94        );
95        interval
96    }
97
98    /// Try to create a new genomic interval, returning an error if invalid
99    ///
100    /// This is the fallible version of `new()`.
101    pub fn try_new<S: Into<String>>(chrom: S, start: u64, end: u64) -> Result<Self> {
102        if start > end {
103            return Err(Error::InvalidInput(format!(
104                "Invalid interval: start ({}) > end ({})",
105                start, end
106            )));
107        }
108        Ok(Self {
109            chrom: chrom.into(),
110            start,
111            end,
112        })
113    }
114
115    /// Create a point interval (length 1) at the given position
116    ///
117    /// Useful for representing SNPs or single positions.
118    pub fn point<S: Into<String>>(chrom: S, pos: u64) -> Self {
119        Self {
120            chrom: chrom.into(),
121            start: pos,
122            end: pos + 1,
123        }
124    }
125
126    /// Get the length of this interval
127    ///
128    /// Returns `end - start`.
129    #[inline]
130    pub fn len(&self) -> u64 {
131        self.end - self.start
132    }
133
134    /// Check if this is an empty interval (start == end)
135    #[inline]
136    pub fn is_empty(&self) -> bool {
137        self.start == self.end
138    }
139
140    /// Check if this interval is a point (length 1)
141    #[inline]
142    pub fn is_point(&self) -> bool {
143        self.len() == 1
144    }
145
146    /// Check if two intervals overlap (share at least one base)
147    ///
148    /// Returns `false` if intervals are on different chromosomes.
149    ///
150    /// # Example
151    /// ```
152    /// use genomicframe_core::interval::GenomicInterval;
153    ///
154    /// let a = GenomicInterval::new("chr1", 100, 200);
155    /// let b = GenomicInterval::new("chr1", 150, 250);
156    /// assert!(a.overlaps(&b));
157    ///
158    /// let c = GenomicInterval::new("chr1", 200, 300);
159    /// assert!(!a.overlaps(&c)); // Adjacent but not overlapping
160    /// ```
161    #[inline]
162    pub fn overlaps(&self, other: &GenomicInterval) -> bool {
163        self.chrom == other.chrom && self.start < other.end && other.start < self.end
164    }
165
166    /// Check if this interval completely contains another interval
167    ///
168    /// Returns `false` if intervals are on different chromosomes.
169    ///
170    /// # Example
171    /// ```
172    /// use genomicframe_core::interval::GenomicInterval;
173    ///
174    /// let a = GenomicInterval::new("chr1", 100, 300);
175    /// let b = GenomicInterval::new("chr1", 150, 200);
176    /// assert!(a.contains(&b));
177    /// assert!(!b.contains(&a));
178    /// ```
179    #[inline]
180    pub fn contains(&self, other: &GenomicInterval) -> bool {
181        self.chrom == other.chrom && self.start <= other.start && other.end <= self.end
182    }
183
184    /// Check if this interval contains a specific position
185    ///
186    /// # Example
187    /// ```
188    /// use genomicframe_core::interval::GenomicInterval;
189    ///
190    /// let interval = GenomicInterval::new("chr1", 100, 200);
191    /// assert!(interval.contains_pos("chr1", 150));
192    /// assert!(!interval.contains_pos("chr1", 200)); // end is exclusive
193    /// ```
194    #[inline]
195    pub fn contains_pos(&self, chrom: &str, pos: u64) -> bool {
196        self.chrom == chrom && self.start <= pos && pos < self.end
197    }
198
199    /// Compute the length of overlap between two intervals
200    ///
201    /// Returns 0 if intervals don't overlap or are on different chromosomes.
202    ///
203    /// # Example
204    /// ```
205    /// use genomicframe_core::interval::GenomicInterval;
206    ///
207    /// let a = GenomicInterval::new("chr1", 100, 200);
208    /// let b = GenomicInterval::new("chr1", 150, 250);
209    /// assert_eq!(a.overlap_length(&b), 50);
210    /// ```
211    pub fn overlap_length(&self, other: &GenomicInterval) -> u64 {
212        if !self.overlaps(other) {
213            0
214        } else {
215            min(self.end, other.end) - max(self.start, other.start)
216        }
217    }
218
219    /// Compute the intersection of two intervals
220    ///
221    /// Returns `None` if intervals don't overlap or are on different chromosomes.
222    ///
223    /// # Example
224    /// ```
225    /// use genomicframe_core::interval::GenomicInterval;
226    ///
227    /// let a = GenomicInterval::new("chr1", 100, 200);
228    /// let b = GenomicInterval::new("chr1", 150, 250);
229    /// let intersection = a.intersect(&b).unwrap();
230    /// assert_eq!(intersection.start, 150);
231    /// assert_eq!(intersection.end, 200);
232    /// ```
233    pub fn intersect(&self, other: &GenomicInterval) -> Option<GenomicInterval> {
234        if !self.overlaps(other) {
235            None
236        } else {
237            Some(GenomicInterval {
238                chrom: self.chrom.clone(),
239                start: max(self.start, other.start),
240                end: min(self.end, other.end),
241            })
242        }
243    }
244
245    /// Compute the union (span) of two intervals
246    ///
247    /// Returns the smallest interval that contains both intervals.
248    /// Returns `None` if intervals are on different chromosomes.
249    ///
250    /// Note: This may include regions not covered by either original interval.
251    ///
252    /// # Example
253    /// ```
254    /// use genomicframe_core::interval::GenomicInterval;
255    ///
256    /// let a = GenomicInterval::new("chr1", 100, 200);
257    /// let b = GenomicInterval::new("chr1", 300, 400);
258    /// let union = a.union(&b).unwrap();
259    /// assert_eq!(union.start, 100);
260    /// assert_eq!(union.end, 400);
261    /// assert_eq!(union.len(), 300); // Includes gap!
262    /// ```
263    pub fn union(&self, other: &GenomicInterval) -> Option<GenomicInterval> {
264        if self.chrom != other.chrom {
265            None
266        } else {
267            Some(GenomicInterval {
268                chrom: self.chrom.clone(),
269                start: min(self.start, other.start),
270                end: max(self.end, other.end),
271            })
272        }
273    }
274
275    /// Compute the distance between two intervals
276    ///
277    /// Returns:
278    /// - `Some(0)` if intervals overlap
279    /// - `Some(n)` where n > 0 is the gap size between intervals
280    /// - `None` if intervals are on different chromosomes
281    ///
282    /// # Example
283    /// ```
284    /// use genomicframe_core::interval::GenomicInterval;
285    ///
286    /// let a = GenomicInterval::new("chr1", 100, 200);
287    /// let b = GenomicInterval::new("chr1", 250, 300);
288    /// assert_eq!(a.distance(&b), Some(50)); // 250 - 200 = 50
289    ///
290    /// let c = GenomicInterval::new("chr1", 150, 250);
291    /// assert_eq!(a.distance(&c), Some(0)); // Overlapping
292    /// ```
293    pub fn distance(&self, other: &GenomicInterval) -> Option<u64> {
294        if self.chrom != other.chrom {
295            None
296        } else if self.overlaps(other) {
297            Some(0)
298        } else if self.end <= other.start {
299            Some(other.start - self.end)
300        } else {
301            Some(self.start - other.end)
302        }
303    }
304
305    /// Expand this interval by a fixed amount on both sides
306    ///
307    /// Start is reduced by `amount`, end is increased by `amount`.
308    /// Start is clamped to 0 (won't go negative).
309    ///
310    /// # Example
311    /// ```
312    /// use genomicframe_core::interval::GenomicInterval;
313    ///
314    /// let interval = GenomicInterval::new("chr1", 100, 200);
315    /// let expanded = interval.expand(50);
316    /// assert_eq!(expanded.start, 50);
317    /// assert_eq!(expanded.end, 250);
318    /// ```
319    pub fn expand(&self, amount: u64) -> GenomicInterval {
320        GenomicInterval {
321            chrom: self.chrom.clone(),
322            start: self.start.saturating_sub(amount),
323            end: self.end + amount,
324        }
325    }
326
327    /// Shrink this interval by a fixed amount on both sides
328    ///
329    /// Returns `None` if shrinking would make the interval invalid.
330    ///
331    /// # Example
332    /// ```
333    /// use genomicframe_core::interval::GenomicInterval;
334    ///
335    /// let interval = GenomicInterval::new("chr1", 100, 200);
336    /// let shrunk = interval.shrink(25).unwrap();
337    /// assert_eq!(shrunk.start, 125);
338    /// assert_eq!(shrunk.end, 175);
339    /// ```
340    pub fn shrink(&self, amount: u64) -> Option<GenomicInterval> {
341        let new_start = self.start + amount;
342        let new_end = self.end.saturating_sub(amount);
343
344        if new_start <= new_end {
345            Some(GenomicInterval {
346                chrom: self.chrom.clone(),
347                start: new_start,
348                end: new_end,
349            })
350        } else {
351            None
352        }
353    }
354
355    /// Convert to BED format string (0-based, tab-separated)
356    ///
357    /// # Example
358    /// ```
359    /// use genomicframe_core::interval::GenomicInterval;
360    ///
361    /// let interval = GenomicInterval::new("chr1", 100, 200);
362    /// assert_eq!(interval.to_bed_string(), "chr1\t100\t200");
363    /// ```
364    pub fn to_bed_string(&self) -> String {
365        format!("{}\t{}\t{}", self.chrom, self.start, self.end)
366    }
367
368    /// Parse from BED format string (0-based, tab-separated)
369    ///
370    /// # Example
371    /// ```
372    /// use genomicframe_core::interval::GenomicInterval;
373    ///
374    /// let interval = GenomicInterval::from_bed_string("chr1\t100\t200").unwrap();
375    /// assert_eq!(interval.chrom, "chr1");
376    /// assert_eq!(interval.start, 100);
377    /// assert_eq!(interval.end, 200);
378    /// ```
379    pub fn from_bed_string(s: &str) -> Result<Self> {
380        let parts: Vec<&str> = s.split('\t').collect();
381        if parts.len() < 3 {
382            return Err(Error::Parse(format!(
383                "Invalid BED string: expected at least 3 fields, got {}",
384                parts.len()
385            )));
386        }
387
388        let chrom = parts[0].to_string();
389        let start = parts[1].parse::<u64>().map_err(|_| {
390            Error::Parse(format!("Invalid start position: {}", parts[1]))
391        })?;
392        let end = parts[2].parse::<u64>().map_err(|_| {
393            Error::Parse(format!("Invalid end position: {}", parts[2]))
394        })?;
395
396        Self::try_new(chrom, start, end)
397    }
398}
399
400impl fmt::Display for GenomicInterval {
401    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
402        write!(f, "{}:{}-{}", self.chrom, self.start, self.end)
403    }
404}
405
406/// Helper for sorting intervals
407impl PartialOrd for GenomicInterval {
408    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
409        Some(self.cmp(other))
410    }
411}
412
413impl Ord for GenomicInterval {
414    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
415        // Sort by chromosome, then start, then end
416        (&self.chrom, self.start, self.end).cmp(&(&other.chrom, other.start, other.end))
417    }
418}
419
420#[cfg(test)]
421mod tests {
422    use super::*;
423
424    #[test]
425    fn test_new() {
426        let interval = GenomicInterval::new("chr1", 100, 200);
427        assert_eq!(interval.chrom, "chr1");
428        assert_eq!(interval.start, 100);
429        assert_eq!(interval.end, 200);
430        assert_eq!(interval.len(), 100);
431    }
432
433    #[test]
434    #[should_panic]
435    fn test_invalid_interval() {
436        GenomicInterval::new("chr1", 200, 100); // start > end
437    }
438
439    #[test]
440    fn test_try_new() {
441        assert!(GenomicInterval::try_new("chr1", 200, 100).is_err());
442        assert!(GenomicInterval::try_new("chr1", 100, 200).is_ok());
443    }
444
445    #[test]
446    fn test_point() {
447        let interval = GenomicInterval::point("chr1", 100);
448        assert_eq!(interval.start, 100);
449        assert_eq!(interval.end, 101);
450        assert!(interval.is_point());
451    }
452
453    #[test]
454    fn test_overlaps() {
455        let a = GenomicInterval::new("chr1", 100, 200);
456        let b = GenomicInterval::new("chr1", 150, 250);
457        let c = GenomicInterval::new("chr1", 200, 300);
458        let d = GenomicInterval::new("chr2", 100, 200);
459
460        assert!(a.overlaps(&b));
461        assert!(!a.overlaps(&c)); // Adjacent
462        assert!(!a.overlaps(&d)); // Different chromosome
463    }
464
465    #[test]
466    fn test_contains() {
467        let a = GenomicInterval::new("chr1", 100, 300);
468        let b = GenomicInterval::new("chr1", 150, 200);
469
470        assert!(a.contains(&b));
471        assert!(!b.contains(&a));
472    }
473
474    #[test]
475    fn test_contains_pos() {
476        let interval = GenomicInterval::new("chr1", 100, 200);
477        assert!(interval.contains_pos("chr1", 100));
478        assert!(interval.contains_pos("chr1", 150));
479        assert!(!interval.contains_pos("chr1", 200)); // end is exclusive
480        assert!(!interval.contains_pos("chr2", 150));
481    }
482
483    #[test]
484    fn test_overlap_length() {
485        let a = GenomicInterval::new("chr1", 100, 200);
486        let b = GenomicInterval::new("chr1", 150, 250);
487        let c = GenomicInterval::new("chr1", 200, 300);
488
489        assert_eq!(a.overlap_length(&b), 50);
490        assert_eq!(a.overlap_length(&c), 0);
491    }
492
493    #[test]
494    fn test_intersect() {
495        let a = GenomicInterval::new("chr1", 100, 200);
496        let b = GenomicInterval::new("chr1", 150, 250);
497
498        let intersection = a.intersect(&b).unwrap();
499        assert_eq!(intersection.start, 150);
500        assert_eq!(intersection.end, 200);
501
502        let c = GenomicInterval::new("chr1", 200, 300);
503        assert!(a.intersect(&c).is_none());
504    }
505
506    #[test]
507    fn test_union() {
508        let a = GenomicInterval::new("chr1", 100, 200);
509        let b = GenomicInterval::new("chr1", 300, 400);
510
511        let union = a.union(&b).unwrap();
512        assert_eq!(union.start, 100);
513        assert_eq!(union.end, 400);
514
515        let c = GenomicInterval::new("chr2", 100, 200);
516        assert!(a.union(&c).is_none());
517    }
518
519    #[test]
520    fn test_distance() {
521        let a = GenomicInterval::new("chr1", 100, 200);
522        let b = GenomicInterval::new("chr1", 250, 300);
523        let c = GenomicInterval::new("chr1", 150, 250);
524
525        assert_eq!(a.distance(&b), Some(50));
526        assert_eq!(a.distance(&c), Some(0)); // Overlapping
527    }
528
529    #[test]
530    fn test_expand() {
531        let interval = GenomicInterval::new("chr1", 100, 200);
532        let expanded = interval.expand(50);
533        assert_eq!(expanded.start, 50);
534        assert_eq!(expanded.end, 250);
535
536        // Test saturation at 0
537        let small = GenomicInterval::new("chr1", 10, 20);
538        let expanded = small.expand(50);
539        assert_eq!(expanded.start, 0);
540    }
541
542    #[test]
543    fn test_shrink() {
544        let interval = GenomicInterval::new("chr1", 100, 200);
545        let shrunk = interval.shrink(25).unwrap();
546        assert_eq!(shrunk.start, 125);
547        assert_eq!(shrunk.end, 175);
548
549        // Shrink too much
550        assert!(interval.shrink(100).is_none());
551    }
552
553    #[test]
554    fn test_bed_string() {
555        let interval = GenomicInterval::new("chr1", 100, 200);
556        assert_eq!(interval.to_bed_string(), "chr1\t100\t200");
557
558        let parsed = GenomicInterval::from_bed_string("chr1\t100\t200").unwrap();
559        assert_eq!(parsed, interval);
560    }
561
562    #[test]
563    fn test_display() {
564        let interval = GenomicInterval::new("chr1", 100, 200);
565        assert_eq!(format!("{}", interval), "chr1:100-200");
566    }
567
568    #[test]
569    fn test_sorting() {
570        let mut intervals = vec![
571            GenomicInterval::new("chr2", 100, 200),
572            GenomicInterval::new("chr1", 300, 400),
573            GenomicInterval::new("chr1", 100, 200),
574        ];
575
576        intervals.sort();
577
578        assert_eq!(intervals[0].chrom, "chr1");
579        assert_eq!(intervals[0].start, 100);
580        assert_eq!(intervals[1].chrom, "chr1");
581        assert_eq!(intervals[1].start, 300);
582        assert_eq!(intervals[2].chrom, "chr2");
583    }
584}