genomicframe_core/interval.rs
1//! Genomic interval operations
2//!
3//! Core data structures and operations for working with genomic intervals.
4//! Used across all formats for region-based queries, overlap detection, and filtering.
5//!
6//! # Design Philosophy
7//!
8//! - **0-based coordinates**: All intervals use 0-based, half-open [start, end) coordinates
9//! - This matches BED, BAM, and internal representations
10//! - VCF uses 1-based coordinates but is converted during parsing
11//! - **Efficient operations**: Overlap and containment checks are O(1)
12//! - **Immutable by default**: Intervals are cheap to clone (just 3 fields)
13//! - **Cross-format compatibility**: Can convert from any genomic record type
14//!
15//! # Examples
16//!
17//! ```
18//! use genomicframe_core::interval::GenomicInterval;
19//!
20//! // Create intervals
21//! let interval1 = GenomicInterval::new("chr1", 1000, 2000);
22//! let interval2 = GenomicInterval::new("chr1", 1500, 2500);
23//!
24//! // Check overlap
25//! assert!(interval1.overlaps(&interval2));
26//!
27//! // Compute overlap length
28//! assert_eq!(interval1.overlap_length(&interval2), 500);
29//!
30//! // Get intersection
31//! let intersection = interval1.intersect(&interval2).unwrap();
32//! assert_eq!(intersection.start, 1500);
33//! assert_eq!(intersection.end, 2000);
34//! ```
35
36use crate::error::{Error, Result};
37use std::cmp::{max, min};
38use std::fmt;
39
40// Conversions from format-specific records
41pub mod conversions;
42
43// Annotation infrastructure
44pub mod annotation;
45
46/// A genomic interval representing a region on a chromosome
47///
48/// Uses 0-based, half-open [start, end) coordinates:
49/// - `start` is inclusive (0-based)
50/// - `end` is exclusive (0-based)
51///
52/// This matches BED format and is the standard for genomic intervals.
53#[derive(Debug, Clone, PartialEq, Eq, Hash)]
54pub struct GenomicInterval {
55 /// Chromosome/contig name (e.g., "chr1", "chrX", "1")
56 pub chrom: String,
57
58 /// Start position (0-based, inclusive)
59 pub start: u64,
60
61 /// End position (0-based, exclusive)
62 pub end: u64,
63}
64
65impl GenomicInterval {
66 /// Create a new genomic interval
67 ///
68 /// # Arguments
69 /// * `chrom` - Chromosome name
70 /// * `start` - Start position (0-based, inclusive)
71 /// * `end` - End position (0-based, exclusive)
72 ///
73 /// # Panics
74 /// Panics if `start > end`
75 ///
76 /// # Example
77 /// ```
78 /// use genomicframe_core::interval::GenomicInterval;
79 ///
80 /// let interval = GenomicInterval::new("chr1", 1000, 2000);
81 /// assert_eq!(interval.len(), 1000);
82 /// ```
83 pub fn new<S: Into<String>>(chrom: S, start: u64, end: u64) -> Self {
84 let interval = Self {
85 chrom: chrom.into(),
86 start,
87 end,
88 };
89 assert!(
90 start <= end,
91 "Invalid interval: start ({}) > end ({})",
92 start,
93 end
94 );
95 interval
96 }
97
98 /// Try to create a new genomic interval, returning an error if invalid
99 ///
100 /// This is the fallible version of `new()`.
101 pub fn try_new<S: Into<String>>(chrom: S, start: u64, end: u64) -> Result<Self> {
102 if start > end {
103 return Err(Error::InvalidInput(format!(
104 "Invalid interval: start ({}) > end ({})",
105 start, end
106 )));
107 }
108 Ok(Self {
109 chrom: chrom.into(),
110 start,
111 end,
112 })
113 }
114
115 /// Create a point interval (length 1) at the given position
116 ///
117 /// Useful for representing SNPs or single positions.
118 pub fn point<S: Into<String>>(chrom: S, pos: u64) -> Self {
119 Self {
120 chrom: chrom.into(),
121 start: pos,
122 end: pos + 1,
123 }
124 }
125
126 /// Get the length of this interval
127 ///
128 /// Returns `end - start`.
129 #[inline]
130 pub fn len(&self) -> u64 {
131 self.end - self.start
132 }
133
134 /// Check if this is an empty interval (start == end)
135 #[inline]
136 pub fn is_empty(&self) -> bool {
137 self.start == self.end
138 }
139
140 /// Check if this interval is a point (length 1)
141 #[inline]
142 pub fn is_point(&self) -> bool {
143 self.len() == 1
144 }
145
146 /// Check if two intervals overlap (share at least one base)
147 ///
148 /// Returns `false` if intervals are on different chromosomes.
149 ///
150 /// # Example
151 /// ```
152 /// use genomicframe_core::interval::GenomicInterval;
153 ///
154 /// let a = GenomicInterval::new("chr1", 100, 200);
155 /// let b = GenomicInterval::new("chr1", 150, 250);
156 /// assert!(a.overlaps(&b));
157 ///
158 /// let c = GenomicInterval::new("chr1", 200, 300);
159 /// assert!(!a.overlaps(&c)); // Adjacent but not overlapping
160 /// ```
161 #[inline]
162 pub fn overlaps(&self, other: &GenomicInterval) -> bool {
163 self.chrom == other.chrom && self.start < other.end && other.start < self.end
164 }
165
166 /// Check if this interval completely contains another interval
167 ///
168 /// Returns `false` if intervals are on different chromosomes.
169 ///
170 /// # Example
171 /// ```
172 /// use genomicframe_core::interval::GenomicInterval;
173 ///
174 /// let a = GenomicInterval::new("chr1", 100, 300);
175 /// let b = GenomicInterval::new("chr1", 150, 200);
176 /// assert!(a.contains(&b));
177 /// assert!(!b.contains(&a));
178 /// ```
179 #[inline]
180 pub fn contains(&self, other: &GenomicInterval) -> bool {
181 self.chrom == other.chrom && self.start <= other.start && other.end <= self.end
182 }
183
184 /// Check if this interval contains a specific position
185 ///
186 /// # Example
187 /// ```
188 /// use genomicframe_core::interval::GenomicInterval;
189 ///
190 /// let interval = GenomicInterval::new("chr1", 100, 200);
191 /// assert!(interval.contains_pos("chr1", 150));
192 /// assert!(!interval.contains_pos("chr1", 200)); // end is exclusive
193 /// ```
194 #[inline]
195 pub fn contains_pos(&self, chrom: &str, pos: u64) -> bool {
196 self.chrom == chrom && self.start <= pos && pos < self.end
197 }
198
199 /// Compute the length of overlap between two intervals
200 ///
201 /// Returns 0 if intervals don't overlap or are on different chromosomes.
202 ///
203 /// # Example
204 /// ```
205 /// use genomicframe_core::interval::GenomicInterval;
206 ///
207 /// let a = GenomicInterval::new("chr1", 100, 200);
208 /// let b = GenomicInterval::new("chr1", 150, 250);
209 /// assert_eq!(a.overlap_length(&b), 50);
210 /// ```
211 pub fn overlap_length(&self, other: &GenomicInterval) -> u64 {
212 if !self.overlaps(other) {
213 0
214 } else {
215 min(self.end, other.end) - max(self.start, other.start)
216 }
217 }
218
219 /// Compute the intersection of two intervals
220 ///
221 /// Returns `None` if intervals don't overlap or are on different chromosomes.
222 ///
223 /// # Example
224 /// ```
225 /// use genomicframe_core::interval::GenomicInterval;
226 ///
227 /// let a = GenomicInterval::new("chr1", 100, 200);
228 /// let b = GenomicInterval::new("chr1", 150, 250);
229 /// let intersection = a.intersect(&b).unwrap();
230 /// assert_eq!(intersection.start, 150);
231 /// assert_eq!(intersection.end, 200);
232 /// ```
233 pub fn intersect(&self, other: &GenomicInterval) -> Option<GenomicInterval> {
234 if !self.overlaps(other) {
235 None
236 } else {
237 Some(GenomicInterval {
238 chrom: self.chrom.clone(),
239 start: max(self.start, other.start),
240 end: min(self.end, other.end),
241 })
242 }
243 }
244
245 /// Compute the union (span) of two intervals
246 ///
247 /// Returns the smallest interval that contains both intervals.
248 /// Returns `None` if intervals are on different chromosomes.
249 ///
250 /// Note: This may include regions not covered by either original interval.
251 ///
252 /// # Example
253 /// ```
254 /// use genomicframe_core::interval::GenomicInterval;
255 ///
256 /// let a = GenomicInterval::new("chr1", 100, 200);
257 /// let b = GenomicInterval::new("chr1", 300, 400);
258 /// let union = a.union(&b).unwrap();
259 /// assert_eq!(union.start, 100);
260 /// assert_eq!(union.end, 400);
261 /// assert_eq!(union.len(), 300); // Includes gap!
262 /// ```
263 pub fn union(&self, other: &GenomicInterval) -> Option<GenomicInterval> {
264 if self.chrom != other.chrom {
265 None
266 } else {
267 Some(GenomicInterval {
268 chrom: self.chrom.clone(),
269 start: min(self.start, other.start),
270 end: max(self.end, other.end),
271 })
272 }
273 }
274
275 /// Compute the distance between two intervals
276 ///
277 /// Returns:
278 /// - `Some(0)` if intervals overlap
279 /// - `Some(n)` where n > 0 is the gap size between intervals
280 /// - `None` if intervals are on different chromosomes
281 ///
282 /// # Example
283 /// ```
284 /// use genomicframe_core::interval::GenomicInterval;
285 ///
286 /// let a = GenomicInterval::new("chr1", 100, 200);
287 /// let b = GenomicInterval::new("chr1", 250, 300);
288 /// assert_eq!(a.distance(&b), Some(50)); // 250 - 200 = 50
289 ///
290 /// let c = GenomicInterval::new("chr1", 150, 250);
291 /// assert_eq!(a.distance(&c), Some(0)); // Overlapping
292 /// ```
293 pub fn distance(&self, other: &GenomicInterval) -> Option<u64> {
294 if self.chrom != other.chrom {
295 None
296 } else if self.overlaps(other) {
297 Some(0)
298 } else if self.end <= other.start {
299 Some(other.start - self.end)
300 } else {
301 Some(self.start - other.end)
302 }
303 }
304
305 /// Expand this interval by a fixed amount on both sides
306 ///
307 /// Start is reduced by `amount`, end is increased by `amount`.
308 /// Start is clamped to 0 (won't go negative).
309 ///
310 /// # Example
311 /// ```
312 /// use genomicframe_core::interval::GenomicInterval;
313 ///
314 /// let interval = GenomicInterval::new("chr1", 100, 200);
315 /// let expanded = interval.expand(50);
316 /// assert_eq!(expanded.start, 50);
317 /// assert_eq!(expanded.end, 250);
318 /// ```
319 pub fn expand(&self, amount: u64) -> GenomicInterval {
320 GenomicInterval {
321 chrom: self.chrom.clone(),
322 start: self.start.saturating_sub(amount),
323 end: self.end + amount,
324 }
325 }
326
327 /// Shrink this interval by a fixed amount on both sides
328 ///
329 /// Returns `None` if shrinking would make the interval invalid.
330 ///
331 /// # Example
332 /// ```
333 /// use genomicframe_core::interval::GenomicInterval;
334 ///
335 /// let interval = GenomicInterval::new("chr1", 100, 200);
336 /// let shrunk = interval.shrink(25).unwrap();
337 /// assert_eq!(shrunk.start, 125);
338 /// assert_eq!(shrunk.end, 175);
339 /// ```
340 pub fn shrink(&self, amount: u64) -> Option<GenomicInterval> {
341 let new_start = self.start + amount;
342 let new_end = self.end.saturating_sub(amount);
343
344 if new_start <= new_end {
345 Some(GenomicInterval {
346 chrom: self.chrom.clone(),
347 start: new_start,
348 end: new_end,
349 })
350 } else {
351 None
352 }
353 }
354
355 /// Convert to BED format string (0-based, tab-separated)
356 ///
357 /// # Example
358 /// ```
359 /// use genomicframe_core::interval::GenomicInterval;
360 ///
361 /// let interval = GenomicInterval::new("chr1", 100, 200);
362 /// assert_eq!(interval.to_bed_string(), "chr1\t100\t200");
363 /// ```
364 pub fn to_bed_string(&self) -> String {
365 format!("{}\t{}\t{}", self.chrom, self.start, self.end)
366 }
367
368 /// Parse from BED format string (0-based, tab-separated)
369 ///
370 /// # Example
371 /// ```
372 /// use genomicframe_core::interval::GenomicInterval;
373 ///
374 /// let interval = GenomicInterval::from_bed_string("chr1\t100\t200").unwrap();
375 /// assert_eq!(interval.chrom, "chr1");
376 /// assert_eq!(interval.start, 100);
377 /// assert_eq!(interval.end, 200);
378 /// ```
379 pub fn from_bed_string(s: &str) -> Result<Self> {
380 let parts: Vec<&str> = s.split('\t').collect();
381 if parts.len() < 3 {
382 return Err(Error::Parse(format!(
383 "Invalid BED string: expected at least 3 fields, got {}",
384 parts.len()
385 )));
386 }
387
388 let chrom = parts[0].to_string();
389 let start = parts[1].parse::<u64>().map_err(|_| {
390 Error::Parse(format!("Invalid start position: {}", parts[1]))
391 })?;
392 let end = parts[2].parse::<u64>().map_err(|_| {
393 Error::Parse(format!("Invalid end position: {}", parts[2]))
394 })?;
395
396 Self::try_new(chrom, start, end)
397 }
398}
399
400impl fmt::Display for GenomicInterval {
401 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
402 write!(f, "{}:{}-{}", self.chrom, self.start, self.end)
403 }
404}
405
406/// Helper for sorting intervals
407impl PartialOrd for GenomicInterval {
408 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
409 Some(self.cmp(other))
410 }
411}
412
413impl Ord for GenomicInterval {
414 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
415 // Sort by chromosome, then start, then end
416 (&self.chrom, self.start, self.end).cmp(&(&other.chrom, other.start, other.end))
417 }
418}
419
420#[cfg(test)]
421mod tests {
422 use super::*;
423
424 #[test]
425 fn test_new() {
426 let interval = GenomicInterval::new("chr1", 100, 200);
427 assert_eq!(interval.chrom, "chr1");
428 assert_eq!(interval.start, 100);
429 assert_eq!(interval.end, 200);
430 assert_eq!(interval.len(), 100);
431 }
432
433 #[test]
434 #[should_panic]
435 fn test_invalid_interval() {
436 GenomicInterval::new("chr1", 200, 100); // start > end
437 }
438
439 #[test]
440 fn test_try_new() {
441 assert!(GenomicInterval::try_new("chr1", 200, 100).is_err());
442 assert!(GenomicInterval::try_new("chr1", 100, 200).is_ok());
443 }
444
445 #[test]
446 fn test_point() {
447 let interval = GenomicInterval::point("chr1", 100);
448 assert_eq!(interval.start, 100);
449 assert_eq!(interval.end, 101);
450 assert!(interval.is_point());
451 }
452
453 #[test]
454 fn test_overlaps() {
455 let a = GenomicInterval::new("chr1", 100, 200);
456 let b = GenomicInterval::new("chr1", 150, 250);
457 let c = GenomicInterval::new("chr1", 200, 300);
458 let d = GenomicInterval::new("chr2", 100, 200);
459
460 assert!(a.overlaps(&b));
461 assert!(!a.overlaps(&c)); // Adjacent
462 assert!(!a.overlaps(&d)); // Different chromosome
463 }
464
465 #[test]
466 fn test_contains() {
467 let a = GenomicInterval::new("chr1", 100, 300);
468 let b = GenomicInterval::new("chr1", 150, 200);
469
470 assert!(a.contains(&b));
471 assert!(!b.contains(&a));
472 }
473
474 #[test]
475 fn test_contains_pos() {
476 let interval = GenomicInterval::new("chr1", 100, 200);
477 assert!(interval.contains_pos("chr1", 100));
478 assert!(interval.contains_pos("chr1", 150));
479 assert!(!interval.contains_pos("chr1", 200)); // end is exclusive
480 assert!(!interval.contains_pos("chr2", 150));
481 }
482
483 #[test]
484 fn test_overlap_length() {
485 let a = GenomicInterval::new("chr1", 100, 200);
486 let b = GenomicInterval::new("chr1", 150, 250);
487 let c = GenomicInterval::new("chr1", 200, 300);
488
489 assert_eq!(a.overlap_length(&b), 50);
490 assert_eq!(a.overlap_length(&c), 0);
491 }
492
493 #[test]
494 fn test_intersect() {
495 let a = GenomicInterval::new("chr1", 100, 200);
496 let b = GenomicInterval::new("chr1", 150, 250);
497
498 let intersection = a.intersect(&b).unwrap();
499 assert_eq!(intersection.start, 150);
500 assert_eq!(intersection.end, 200);
501
502 let c = GenomicInterval::new("chr1", 200, 300);
503 assert!(a.intersect(&c).is_none());
504 }
505
506 #[test]
507 fn test_union() {
508 let a = GenomicInterval::new("chr1", 100, 200);
509 let b = GenomicInterval::new("chr1", 300, 400);
510
511 let union = a.union(&b).unwrap();
512 assert_eq!(union.start, 100);
513 assert_eq!(union.end, 400);
514
515 let c = GenomicInterval::new("chr2", 100, 200);
516 assert!(a.union(&c).is_none());
517 }
518
519 #[test]
520 fn test_distance() {
521 let a = GenomicInterval::new("chr1", 100, 200);
522 let b = GenomicInterval::new("chr1", 250, 300);
523 let c = GenomicInterval::new("chr1", 150, 250);
524
525 assert_eq!(a.distance(&b), Some(50));
526 assert_eq!(a.distance(&c), Some(0)); // Overlapping
527 }
528
529 #[test]
530 fn test_expand() {
531 let interval = GenomicInterval::new("chr1", 100, 200);
532 let expanded = interval.expand(50);
533 assert_eq!(expanded.start, 50);
534 assert_eq!(expanded.end, 250);
535
536 // Test saturation at 0
537 let small = GenomicInterval::new("chr1", 10, 20);
538 let expanded = small.expand(50);
539 assert_eq!(expanded.start, 0);
540 }
541
542 #[test]
543 fn test_shrink() {
544 let interval = GenomicInterval::new("chr1", 100, 200);
545 let shrunk = interval.shrink(25).unwrap();
546 assert_eq!(shrunk.start, 125);
547 assert_eq!(shrunk.end, 175);
548
549 // Shrink too much
550 assert!(interval.shrink(100).is_none());
551 }
552
553 #[test]
554 fn test_bed_string() {
555 let interval = GenomicInterval::new("chr1", 100, 200);
556 assert_eq!(interval.to_bed_string(), "chr1\t100\t200");
557
558 let parsed = GenomicInterval::from_bed_string("chr1\t100\t200").unwrap();
559 assert_eq!(parsed, interval);
560 }
561
562 #[test]
563 fn test_display() {
564 let interval = GenomicInterval::new("chr1", 100, 200);
565 assert_eq!(format!("{}", interval), "chr1:100-200");
566 }
567
568 #[test]
569 fn test_sorting() {
570 let mut intervals = vec![
571 GenomicInterval::new("chr2", 100, 200),
572 GenomicInterval::new("chr1", 300, 400),
573 GenomicInterval::new("chr1", 100, 200),
574 ];
575
576 intervals.sort();
577
578 assert_eq!(intervals[0].chrom, "chr1");
579 assert_eq!(intervals[0].start, 100);
580 assert_eq!(intervals[1].chrom, "chr1");
581 assert_eq!(intervals[1].start, 300);
582 assert_eq!(intervals[2].chrom, "chr2");
583 }
584}