Skip to main content

genomicframe_core/
core.rs

1//! Core types and traits for genomic data representation
2//!
3//! This module defines the fundamental building blocks for representing
4//! genomic data in a memory-efficient, composable way.
5
6use crate::error::{Error, Result};
7
/// A single locus on a reference sequence: chromosome name plus coordinate.
///
/// The derived ordering compares `chrom` first and `pos` second. `chrom` is
/// compared as a plain string, so `"chr10"` sorts before `"chr2"`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct GenomicPosition {
    /// Chromosome name (e.g., "chr1", "X", "MT")
    pub chrom: String,
    /// 1-based genomic coordinate
    pub pos: u64,
}

impl GenomicPosition {
    /// Build a position from a chromosome name and a 1-based coordinate.
    ///
    /// `chrom` accepts anything convertible into a `String` (`&str`,
    /// `String`, ...). Neither argument is validated here.
    pub fn new(chrom: impl Into<String>, pos: u64) -> Self {
        let chrom = chrom.into();
        Self { chrom, pos }
    }
}
26
27/// Represents a genomic interval (chromosome + start + end)
28#[derive(Debug, Clone, PartialEq, Eq, Hash)]
29pub struct GenomicInterval {
30    /// Chromosome name
31    pub chrom: String,
32    /// Start position (0-based, inclusive)
33    pub start: u64,
34    /// End position (0-based, exclusive)
35    pub end: u64,
36}
37
38impl GenomicInterval {
39    /// Create a new genomic interval
40    pub fn new(chrom: impl Into<String>, start: u64, end: u64) -> Result<Self> {
41        let chrom = chrom.into();
42        if start > end {
43            return Err(Error::InvalidInput(format!(
44                "Start position {} is greater than end position {}",
45                start, end
46            )));
47        }
48        Ok(Self { chrom, start, end })
49    }
50
51    /// Check if this interval contains a position
52    pub fn contains(&self, pos: &GenomicPosition) -> bool {
53        self.chrom == pos.chrom && self.start < pos.pos && pos.pos <= self.end
54    }
55
56    /// Calculate the length of this interval
57    pub fn len(&self) -> u64 {
58        self.end - self.start
59    }
60
61    /// Check if this interval is empty
62    pub fn is_empty(&self) -> bool {
63        self.start == self.end
64    }
65}
66
67/// Trait for types that can be iterated over genomic records
68///
69/// This is the core streaming interface - records are produced lazily
70/// and never buffered unless explicitly requested by the user.
71pub trait GenomicRecordIterator {
72    /// The record type yielded by this iterator
73    type Record;
74
75    /// Advance to the next record
76    ///
77    /// Returns `Ok(None)` at EOF, `Ok(Some(record))` for each record,
78    /// or `Err` on parse/IO errors.
79    fn next_record(&mut self) -> Result<Option<Self::Record>>;
80
81    /// Advance to the next raw record
82    fn next_raw(&mut self) -> Result<Option<Vec<u8>>>;
83
84    /// Consume the iterator and collect all records into a vector
85    ///
86    /// **WARNING**: This loads the entire file into memory.
87    /// Only use for small files or when you explicitly need all data in RAM.
88    /// For large files, iterate with `next_record()` instead.
89    fn collect_all(mut self) -> Result<Vec<Self::Record>>
90    where
91        Self: Sized,
92    {
93        let mut records = Vec::new();
94        while let Some(record) = self.next_record()? {
95            records.push(record);
96        }
97        Ok(records)
98    }
99
100    /// Process records in chunks for memory-efficient batch operations
101    ///
102    /// This is the preferred way to process large files - iterate in
103    /// fixed-size chunks, process each chunk, then discard it.
104    fn chunks(self, chunk_size: usize) -> ChunkedIterator<Self>
105    where
106        Self: Sized,
107    {
108        ChunkedIterator {
109            inner: self,
110            chunk_size,
111        }
112    }
113}
114
115/// Chunked iterator for batch processing
116pub struct ChunkedIterator<I> {
117    inner: I,
118    chunk_size: usize,
119}
120
121impl<I> Iterator for ChunkedIterator<I>
122where
123    I: GenomicRecordIterator,
124{
125    type Item = Result<Vec<I::Record>>;
126
127    fn next(&mut self) -> Option<Self::Item> {
128        let mut chunk = Vec::with_capacity(self.chunk_size);
129
130        for _ in 0..self.chunk_size {
131            match self.inner.next_record() {
132                Ok(Some(record)) => chunk.push(record),
133                Ok(None) => break,
134                Err(e) => return Some(Err(e)),
135            }
136        }
137
138        if chunk.is_empty() {
139            None
140        } else {
141            Some(Ok(chunk))
142        }
143    }
144}
145
146/// Trait for genomic file readers with metadata access
147pub trait GenomicReader: GenomicRecordIterator {
148    /// Metadata type for this reader
149    type Metadata;
150
151    /// Get metadata about the genomic data (headers, contigs, etc.)
152    fn metadata(&self) -> &Self::Metadata;
153}
154
155/// Trait for genomic file writers
156pub trait GenomicWriter {
157    /// The record type written by this writer
158    type Record;
159
160    /// Write a single record
161    fn write_record(&mut self, record: &Self::Record) -> Result<()>;
162
163    /// Write multiple records
164    fn write_records(&mut self, records: &[Self::Record]) -> Result<()> {
165        for record in records {
166            self.write_record(record)?;
167        }
168        Ok(())
169    }
170
171    /// Flush any buffered data
172    fn flush(&mut self) -> Result<()>;
173}
174
/// Which strand of the reference a feature lies on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Strand {
    /// Forward strand (+)
    Forward,
    /// Reverse strand (-)
    Reverse,
    /// Unknown or unstranded
    Unknown,
}

impl Strand {
    /// Interpret a single character as a strand.
    ///
    /// `'+'` maps to [`Strand::Forward`] and `'-'` to [`Strand::Reverse`];
    /// every other character, including `'.'`, maps to [`Strand::Unknown`].
    pub fn from_char(c: char) -> Self {
        if c == '+' {
            Self::Forward
        } else if c == '-' {
            Self::Reverse
        } else {
            Self::Unknown
        }
    }

    /// Render the strand as `'+'`, `'-'`, or `'.'` (for unknown).
    pub fn to_char(self) -> char {
        match self {
            Self::Forward => '+',
            Self::Reverse => '-',
            Self::Unknown => '.',
        }
    }
}
205
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_genomic_position() {
        let pos = GenomicPosition::new("chr1", 12345);
        assert_eq!(pos.chrom, "chr1");
        assert_eq!(pos.pos, 12345);
    }

    #[test]
    fn test_genomic_interval() {
        let interval = GenomicInterval::new("chr1", 1000, 2000).unwrap();
        assert_eq!(interval.len(), 1000);
        assert!(!interval.is_empty());

        let pos_inside = GenomicPosition::new("chr1", 1500);
        let pos_outside = GenomicPosition::new("chr1", 3000);
        assert!(interval.contains(&pos_inside));
        assert!(!interval.contains(&pos_outside));
    }

    #[test]
    fn test_interval_boundaries() {
        // The interval is 0-based half-open while positions are 1-based, so
        // [1000, 2000) covers 1-based positions 1001 through 2000 inclusive.
        let interval = GenomicInterval::new("chr1", 1000, 2000).unwrap();
        assert!(!interval.contains(&GenomicPosition::new("chr1", 1000)));
        assert!(interval.contains(&GenomicPosition::new("chr1", 1001)));
        assert!(interval.contains(&GenomicPosition::new("chr1", 2000)));
        assert!(!interval.contains(&GenomicPosition::new("chr1", 2001)));

        // Same coordinate on a different chromosome is never contained.
        assert!(!interval.contains(&GenomicPosition::new("chr2", 1500)));
    }

    #[test]
    fn test_empty_interval() {
        let interval = GenomicInterval::new("chr1", 5, 5).unwrap();
        assert_eq!(interval.len(), 0);
        assert!(interval.is_empty());
        assert!(!interval.contains(&GenomicPosition::new("chr1", 5)));
        assert!(!interval.contains(&GenomicPosition::new("chr1", 6)));
    }

    #[test]
    fn test_invalid_interval() {
        let result = GenomicInterval::new("chr1", 2000, 1000);
        assert!(result.is_err());
    }

    #[test]
    fn test_strand() {
        assert_eq!(Strand::from_char('+'), Strand::Forward);
        assert_eq!(Strand::from_char('-'), Strand::Reverse);
        assert_eq!(Strand::from_char('.'), Strand::Unknown);
        assert_eq!(Strand::from_char('x'), Strand::Unknown);
        assert_eq!(Strand::Forward.to_char(), '+');
        assert_eq!(Strand::Reverse.to_char(), '-');
        assert_eq!(Strand::Unknown.to_char(), '.');
    }
}