// genomicframe-core 0.2.0
//
// High-performance genomics I/O and interoperability layer
// Documentation
//! Core types and traits for genomic data representation
//!
//! This module defines the fundamental building blocks for representing
//! genomic data in a memory-efficient, composable way.

use crate::error::{Error, Result};

/// A single locus on a reference sequence: a chromosome name plus a coordinate.
///
/// The derived ordering compares `chrom` first (as a plain string, so
/// `"chr10"` sorts before `"chr2"`) and then `pos`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct GenomicPosition {
    /// Name of the chromosome or contig (e.g., "chr1", "X", "MT")
    pub chrom: String,
    /// Coordinate on that chromosome, counted from 1
    pub pos: u64,
}

impl GenomicPosition {
    /// Build a position from anything convertible into an owned chromosome name.
    ///
    /// No validation is performed on either argument.
    pub fn new(chrom: impl Into<String>, pos: u64) -> Self {
        let chrom = chrom.into();
        Self { chrom, pos }
    }
}

/// Represents a genomic interval (chromosome + start + end)
///
/// Coordinates are 0-based and half-open (`[start, end)`), so `start == end`
/// denotes an empty interval. [`GenomicInterval::new`] rejects `start > end`,
/// but the fields are public and can be set directly; the accessor methods
/// therefore treat an inverted interval as empty instead of panicking.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GenomicInterval {
    /// Chromosome name
    pub chrom: String,
    /// Start position (0-based, inclusive)
    pub start: u64,
    /// End position (0-based, exclusive)
    pub end: u64,
}

impl GenomicInterval {
    /// Create a new genomic interval
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidInput` when `start` is greater than `end`.
    /// `start == end` is accepted and yields an empty interval.
    pub fn new(chrom: impl Into<String>, start: u64, end: u64) -> Result<Self> {
        if start > end {
            return Err(Error::InvalidInput(format!(
                "Start position {} is greater than end position {}",
                start, end
            )));
        }
        // Convert the name only after validation so the error path does not allocate.
        Ok(Self {
            chrom: chrom.into(),
            start,
            end,
        })
    }

    /// Check if this interval contains a position
    ///
    /// `pos.pos` is 1-based while the interval is 0-based half-open, so the
    /// 1-based position `p` maps to the 0-based offset `p - 1`. The test
    /// `start <= p - 1 < end` is written as `start < p <= end`, which avoids
    /// subtracting from `p` (and underflowing when `p == 0`).
    /// Positions on a different chromosome are never contained.
    pub fn contains(&self, pos: &GenomicPosition) -> bool {
        self.chrom == pos.chrom && self.start < pos.pos && pos.pos <= self.end
    }

    /// Calculate the length of this interval
    ///
    /// Returns `end - start`. An inverted interval (`start > end`, only
    /// possible when the public fields were set directly) has length 0 rather
    /// than triggering an integer underflow.
    pub fn len(&self) -> u64 {
        self.end.saturating_sub(self.start)
    }

    /// Check if this interval is empty
    ///
    /// True exactly when [`len`](Self::len) is 0, i.e. when `start >= end`.
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}

/// Streaming source of genomic records.
///
/// Implementors hand out one record per call; nothing is buffered by the
/// trait itself unless the caller opts in through [`collect_all`] or
/// [`chunks`].
///
/// [`collect_all`]: GenomicRecordIterator::collect_all
/// [`chunks`]: GenomicRecordIterator::chunks
pub trait GenomicRecordIterator {
    /// Type of the parsed record produced by [`next_record`](Self::next_record)
    type Record;

    /// Produce the next parsed record.
    ///
    /// `Ok(Some(record))` carries a record, `Ok(None)` signals end of input,
    /// and `Err` reports a parse or I/O failure.
    fn next_record(&mut self) -> Result<Option<Self::Record>>;

    /// Produce the next raw record as an owned byte buffer.
    fn next_raw(&mut self) -> Result<Option<Vec<u8>>>;

    /// Drain the iterator into a single vector.
    ///
    /// **WARNING**: every remaining record ends up in memory at once. Reserve
    /// this for small inputs or cases that truly need all data in RAM; for
    /// large files call `next_record()` in a loop or use `chunks()`.
    ///
    /// Stops at the first error and returns it; records read before the
    /// error are dropped.
    fn collect_all(mut self) -> Result<Vec<Self::Record>>
    where
        Self: Sized,
    {
        // `transpose` turns `Result<Option<T>>` into `Option<Result<T>>`, which is
        // the shape `from_fn` needs; collecting into `Result<Vec<_>>` then
        // short-circuits on the first `Err`.
        std::iter::from_fn(|| self.next_record().transpose()).collect()
    }

    /// Wrap this iterator so it yields batches of up to `chunk_size` records.
    ///
    /// This is the preferred way to walk large files: take a fixed-size
    /// batch, process it, drop it, repeat.
    fn chunks(self, chunk_size: usize) -> ChunkedIterator<Self>
    where
        Self: Sized,
    {
        let inner = self;
        ChunkedIterator { inner, chunk_size }
    }
}

/// Chunked iterator for batch processing
///
/// Wraps a [`GenomicRecordIterator`] and yields its records grouped into
/// `Vec`s of at most `chunk_size` elements; the final chunk may be shorter.
///
/// Caveats:
/// - A `chunk_size` of 0 yields no chunks at all and never touches the
///   underlying iterator.
/// - If the underlying iterator fails in the middle of a chunk, the error is
///   yielded and the records already read for that chunk are dropped.
/// - The adapter keeps no "finished" state: calling `next` again after `None`
///   or after an error polls the underlying iterator again.
pub struct ChunkedIterator<I> {
    /// Underlying record source
    inner: I,
    /// Maximum number of records per yielded chunk
    chunk_size: usize,
}

impl<I> Iterator for ChunkedIterator<I>
where
    I: GenomicRecordIterator,
{
    type Item = Result<Vec<I::Record>>;

    /// Read up to `chunk_size` records and return them as one batch.
    ///
    /// Returns `None` once the underlying iterator has no more records (or
    /// when `chunk_size` is 0), `Some(Ok(chunk))` with a non-empty chunk
    /// otherwise, and `Some(Err(_))` as soon as a read fails.
    fn next(&mut self) -> Option<Self::Item> {
        /// Upper bound on the up-front allocation. `chunk_size` is caller
        /// supplied, so reserving it verbatim can panic with a capacity
        /// overflow or request far more memory than the data needs; beyond
        /// this bound the vector simply grows on demand.
        const MAX_PREALLOCATED_RECORDS: usize = 1024;

        if self.chunk_size == 0 {
            return None;
        }

        // Fetch the first record before allocating, so the call that hits
        // end-of-input (or an immediate error) costs no allocation.
        let first = match self.inner.next_record() {
            Ok(Some(record)) => record,
            Ok(None) => return None,
            Err(e) => return Some(Err(e)),
        };

        let mut chunk = Vec::with_capacity(self.chunk_size.min(MAX_PREALLOCATED_RECORDS));
        chunk.push(first);

        while chunk.len() < self.chunk_size {
            match self.inner.next_record() {
                Ok(Some(record)) => chunk.push(record),
                Ok(None) => break,
                Err(e) => return Some(Err(e)),
            }
        }

        Some(Ok(chunk))
    }
}

/// Trait for genomic file readers with metadata access
///
/// Every `GenomicReader` is also a [`GenomicRecordIterator`] (supertrait
/// bound), so records are pulled through that trait while this one adds
/// access to reader-level metadata.
pub trait GenomicReader: GenomicRecordIterator {
    /// Metadata type for this reader
    ///
    /// Chosen by each implementor; this trait places no bounds on it.
    type Metadata;

    /// Get metadata about the genomic data (headers, contigs, etc.)
    ///
    /// Returns a shared borrow tied to `&self`; the method cannot fail and
    /// does not advance the record stream (it takes `&self`, not `&mut self`).
    fn metadata(&self) -> &Self::Metadata;
}

/// Sink for genomic records.
pub trait GenomicWriter {
    /// Type of record this writer accepts
    type Record;

    /// Emit one record.
    fn write_record(&mut self, record: &Self::Record) -> Result<()>;

    /// Emit a slice of records, in order.
    ///
    /// Delegates to [`write_record`](Self::write_record) for each element and
    /// stops at the first failure, returning that error; records after the
    /// failing one are not attempted. An empty slice is a no-op.
    fn write_records(&mut self, records: &[Self::Record]) -> Result<()> {
        records
            .iter()
            .try_for_each(|record| self.write_record(record))
    }

    /// Push any buffered output to the underlying destination.
    fn flush(&mut self) -> Result<()>;
}

/// Which strand of the reference a feature lies on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Strand {
    /// Plus strand, written `+`
    Forward,
    /// Minus strand, written `-`
    Reverse,
    /// Strand not known or not applicable, written `.`
    Unknown,
}

impl Strand {
    /// Interpret a single character as a strand.
    ///
    /// `'+'` and `'-'` map to [`Strand::Forward`] and [`Strand::Reverse`];
    /// every other character (not only `'.'`) maps to [`Strand::Unknown`].
    pub fn from_char(c: char) -> Self {
        if c == '+' {
            Self::Forward
        } else if c == '-' {
            Self::Reverse
        } else {
            Self::Unknown
        }
    }

    /// Render the strand as its one-character symbol: `+`, `-`, or `.`.
    pub fn to_char(self) -> char {
        match self {
            Self::Unknown => '.',
            Self::Reverse => '-',
            Self::Forward => '+',
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `new` stores both fields as given.
    #[test]
    fn test_genomic_position() {
        let pos = GenomicPosition::new("chr1", 12345);
        assert_eq!(pos.chrom, "chr1");
        assert_eq!(pos.pos, 12345);
    }

    /// Length, emptiness and a clearly-inside / clearly-outside containment check.
    #[test]
    fn test_genomic_interval() {
        let interval = GenomicInterval::new("chr1", 1000, 2000).unwrap();
        assert_eq!(interval.len(), 1000);
        assert!(!interval.is_empty());

        let pos_inside = GenomicPosition::new("chr1", 1500);
        let pos_outside = GenomicPosition::new("chr1", 3000);
        assert!(interval.contains(&pos_inside));
        assert!(!interval.contains(&pos_outside));
    }

    /// Pins the coordinate conversion at the edges: positions are 1-based,
    /// intervals are 0-based half-open, so `[1000, 2000)` covers the 1-based
    /// positions 1001 through 2000 inclusive.
    #[test]
    fn test_interval_contains_boundaries() {
        let interval = GenomicInterval::new("chr1", 1000, 2000).unwrap();
        assert!(!interval.contains(&GenomicPosition::new("chr1", 1000)));
        assert!(interval.contains(&GenomicPosition::new("chr1", 1001)));
        assert!(interval.contains(&GenomicPosition::new("chr1", 2000)));
        assert!(!interval.contains(&GenomicPosition::new("chr1", 2001)));

        // Same coordinate on another chromosome is never contained.
        assert!(!interval.contains(&GenomicPosition::new("chr2", 1500)));
    }

    /// `start == end` is a valid, empty interval that contains nothing.
    #[test]
    fn test_empty_interval() {
        let interval = GenomicInterval::new("chr1", 5, 5).unwrap();
        assert_eq!(interval.len(), 0);
        assert!(interval.is_empty());
        assert!(!interval.contains(&GenomicPosition::new("chr1", 5)));
        assert!(!interval.contains(&GenomicPosition::new("chr1", 6)));
    }

    /// `start > end` is rejected by the constructor.
    #[test]
    fn test_invalid_interval() {
        let result = GenomicInterval::new("chr1", 2000, 1000);
        assert!(result.is_err());
    }

    /// Both conversion directions, including the catch-all to `Unknown`.
    #[test]
    fn test_strand() {
        assert_eq!(Strand::from_char('+'), Strand::Forward);
        assert_eq!(Strand::from_char('-'), Strand::Reverse);
        assert_eq!(Strand::from_char('.'), Strand::Unknown);
        assert_eq!(Strand::from_char('?'), Strand::Unknown);
        assert_eq!(Strand::Forward.to_char(), '+');
        assert_eq!(Strand::Reverse.to_char(), '-');
        assert_eq!(Strand::Unknown.to_char(), '.');
    }
}