genomicframe-core 0.2.0

High-performance genomics I/O and interoperability layer
Documentation
//! BED (Browser Extensible Data) format reader
//!
//! BED format represents genomic intervals/regions in a tab-delimited text format.
//! Supports BED3 (minimal), BED6 (standard), and BED12 (full) formats.
//!
//! # Design Philosophy
//!
//! - **Streaming by default**: Records are never buffered unless explicitly requested
//! - **O(1) memory**: Process millions of regions with constant memory
//! - **Lazy evaluation**: Parse regions on-demand as you iterate
//! - **Flexible parsing**: Auto-detect BED3/BED6/BED12 format
//!
//! # BED Format Specification
//!
//! ## BED3 (Minimal - required fields):
//! 1. chrom: Chromosome name (e.g., chr1, chrX)
//! 2. chromStart: Start position (0-based, inclusive)
//! 3. chromEnd: End position (0-based, exclusive)
//!
//! ## BED6 (Standard - adds optional fields):
//! 4. name: Feature name/ID
//! 5. score: Score (0-1000)
//! 6. strand: Strand (+, -, or .)
//!
//! ## BED12 (Full - adds block/exon structure):
//! 7. thickStart: Coding region start
//! 8. thickEnd: Coding region end
//! 9. itemRgb: RGB color (e.g., "255,0,0")
//! 10. blockCount: Number of blocks/exons
//! 11. blockSizes: Comma-separated block sizes
//! 12. blockStarts: Comma-separated block starts (relative to chromStart)
//!
//! # Examples
//!
//! ```no_run
//! use genomicframe_core::formats::bed::BedReader;
//! use genomicframe_core::core::GenomicRecordIterator;
//!
//! // Streaming: O(1) memory, processes one region at a time
//! let mut reader = BedReader::from_path("regions.bed")?;
//!
//! // Iterate through regions
//! while let Some(record) = reader.next_record()? {
//!     println!("{}: {}-{}", record.chrom, record.start, record.end);
//!
//!     if let Some(name) = &record.name {
//!         println!("  Name: {}", name);
//!     }
//! }
//! # Ok::<(), genomicframe_core::error::Error>(())
//! ```

use crate::core::GenomicRecordIterator;
use crate::error::{Error, Result};
use crate::io::Compression;
use flate2::read::MultiGzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// BED record (genomic interval)
///
/// Supports BED3, BED6, and BED12 formats through optional fields.
#[derive(Debug, Clone, PartialEq)]
pub struct BedRecord {
    /// Chromosome name (required)
    pub chrom: String,

    /// Start position, 0-based inclusive (required)
    pub start: u64,

    /// End position, 0-based exclusive (required)
    pub end: u64,

    /// Feature name (BED6+)
    pub name: Option<String>,

    /// Score, typically 0-1000 (BED6+)
    pub score: Option<f64>,

    /// Strand: +, -, or . (BED6+)
    pub strand: Option<char>,

    /// Coding region start (BED12)
    pub thick_start: Option<u64>,

    /// Coding region end (BED12)
    pub thick_end: Option<u64>,

    /// RGB color string (BED12)
    pub item_rgb: Option<String>,

    /// Number of blocks/exons (BED12)
    pub block_count: Option<u32>,

    /// Comma-separated block sizes (BED12)
    pub block_sizes: Option<Vec<u32>>,

    /// Comma-separated block starts, relative to chromStart (BED12)
    pub block_starts: Option<Vec<u32>>,
}

impl BedRecord {
    /// Get the length of this interval
    pub fn len(&self) -> u64 {
        self.end.saturating_sub(self.start)
    }

    /// Check if this is an empty interval (start == end)
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }

    /// Get the strand as a string ("+", "-", or ".")
    pub fn strand_str(&self) -> &str {
        match self.strand {
            Some('+') => "+",
            Some('-') => "-",
            _ => ".",
        }
    }

    /// Check if this record has BED6+ fields
    pub fn is_bed6(&self) -> bool {
        self.name.is_some() && self.score.is_some() && self.strand.is_some()
    }

    /// Check if this record has BED12 fields
    pub fn is_bed12(&self) -> bool {
        self.thick_start.is_some()
            && self.thick_end.is_some()
            && self.block_count.is_some()
    }
}

/// BED file reader (streaming, memory-efficient)
pub struct BedReader {
    reader: Box<dyn BufRead>,
    line_buffer: String,
    line_number: usize,
}

impl BedReader {
    /// Open a BED file from a path - handles .bed and .bed.gz automatically
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file = File::open(path)?;
        let compression = Compression::from_path(path);

        let reader: Box<dyn BufRead> = match compression {
            Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
            _ => Box::new(BufReader::new(file)),
        };

        Ok(Self {
            reader,
            line_buffer: String::with_capacity(256),
            line_number: 0,
        })
    }

    /// Create a new BED reader from any BufRead source
    pub fn new<R: BufRead + 'static>(reader: R) -> Self {
        Self {
            reader: Box::new(reader),
            line_buffer: String::with_capacity(256),
            line_number: 0,
        }
    }

    /// Parse a BED record line
    fn parse_record(&mut self, line: &str) -> Result<BedRecord> {
        let fields: Vec<&str> = line.split('\t').collect();

        if fields.len() < 3 {
            return Err(Error::Parse(format!(
                "Line {}: BED record must have at least 3 fields (chrom, start, end), got {}",
                self.line_number,
                fields.len()
            )));
        }

        // Required fields (BED3)
        let chrom = fields[0].to_string();
        let start = fields[1].parse::<u64>().map_err(|_| {
            Error::Parse(format!(
                "Line {}: Invalid start position: {}",
                self.line_number, fields[1]
            ))
        })?;
        let end = fields[2].parse::<u64>().map_err(|_| {
            Error::Parse(format!(
                "Line {}: Invalid end position: {}",
                self.line_number, fields[2]
            ))
        })?;

        // Validate interval
        if start > end {
            return Err(Error::Parse(format!(
                "Line {}: Invalid interval: start ({}) > end ({})",
                self.line_number, start, end
            )));
        }

        // Optional fields (BED6)
        let name = fields.get(3).map(|s| s.to_string());
        let score = fields.get(4).and_then(|s| s.parse::<f64>().ok());
        let strand = fields.get(5).and_then(|s| {
            let c = s.chars().next()?;
            if c == '+' || c == '-' || c == '.' {
                Some(c)
            } else {
                None
            }
        });

        // Optional fields (BED12)
        let thick_start = fields.get(6).and_then(|s| s.parse::<u64>().ok());
        let thick_end = fields.get(7).and_then(|s| s.parse::<u64>().ok());
        let item_rgb = fields.get(8).map(|s| s.to_string());
        let block_count = fields.get(9).and_then(|s| s.parse::<u32>().ok());

        let block_sizes = fields.get(10).map(|s| {
            s.trim_end_matches(',')
                .split(',')
                .filter_map(|x| x.parse::<u32>().ok())
                .collect()
        });

        let block_starts = fields.get(11).map(|s| {
            s.trim_end_matches(',')
                .split(',')
                .filter_map(|x| x.parse::<u32>().ok())
                .collect()
        });

        Ok(BedRecord {
            chrom,
            start,
            end,
            name,
            score,
            strand,
            thick_start,
            thick_end,
            item_rgb,
            block_count,
            block_sizes,
            block_starts,
        })
    }
}

impl GenomicRecordIterator for BedReader {
    type Record = BedRecord;

    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
        loop {
            self.line_buffer.clear();
            let bytes = self.reader.read_line(&mut self.line_buffer)?;

            if bytes == 0 {
                return Ok(None);
            }

            self.line_number += 1;
            let line = self.line_buffer.trim();

            // Skip empty lines and comments
            if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
                continue;
            }

            return Ok(Some(line.as_bytes().to_vec()));
        }
    }

    fn next_record(&mut self) -> Result<Option<Self::Record>> {
        loop {
            self.line_buffer.clear();
            let bytes = self.reader.read_line(&mut self.line_buffer)?;

            if bytes == 0 {
                return Ok(None);
            }

            self.line_number += 1;
            let line = self.line_buffer.trim().to_string();

            // Skip empty lines and comments
            if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
                continue;
            }

            return Ok(Some(self.parse_record(&line)?));
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bed3_parsing() {
        let data = "chr1\t1000\t2000\n";
        let mut reader = BedReader::new(data.as_bytes());

        let record = reader.next_record().unwrap().unwrap();
        assert_eq!(record.chrom, "chr1");
        assert_eq!(record.start, 1000);
        assert_eq!(record.end, 2000);
        assert_eq!(record.len(), 1000);
        assert!(record.name.is_none());
    }

    #[test]
    fn test_bed6_parsing() {
        let data = "chr1\t1000\t2000\tgene1\t100\t+\n";
        let mut reader = BedReader::new(data.as_bytes());

        let record = reader.next_record().unwrap().unwrap();
        assert_eq!(record.chrom, "chr1");
        assert_eq!(record.start, 1000);
        assert_eq!(record.end, 2000);
        assert_eq!(record.name, Some("gene1".to_string()));
        assert_eq!(record.score, Some(100.0));
        assert_eq!(record.strand, Some('+'));
        assert!(record.is_bed6());
    }

    #[test]
    fn test_bed12_parsing() {
        let data = "chr1\t1000\t5000\tgene1\t100\t+\t1200\t4800\t255,0,0\t2\t1000,800\t0,3200\n";
        let mut reader = BedReader::new(data.as_bytes());

        let record = reader.next_record().unwrap().unwrap();
        assert_eq!(record.chrom, "chr1");
        assert_eq!(record.thick_start, Some(1200));
        assert_eq!(record.thick_end, Some(4800));
        assert_eq!(record.block_count, Some(2));
        assert_eq!(record.block_sizes, Some(vec![1000, 800]));
        assert_eq!(record.block_starts, Some(vec![0, 3200]));
        assert!(record.is_bed12());
    }

    #[test]
    fn test_skip_comments() {
        let data = "# This is a comment\ntrack name=test\nchr1\t1000\t2000\n";
        let mut reader = BedReader::new(data.as_bytes());

        let record = reader.next_record().unwrap().unwrap();
        assert_eq!(record.chrom, "chr1");
    }

    #[test]
    fn test_invalid_interval() {
        let data = "chr1\t2000\t1000\n"; // start > end
        let mut reader = BedReader::new(data.as_bytes());

        assert!(reader.next_record().is_err());
    }

    #[test]
    fn test_strand_str() {
        let record = BedRecord {
            chrom: "chr1".to_string(),
            start: 1000,
            end: 2000,
            name: None,
            score: None,
            strand: Some('+'),
            thick_start: None,
            thick_end: None,
            item_rgb: None,
            block_count: None,
            block_sizes: None,
            block_starts: None,
        };

        assert_eq!(record.strand_str(), "+");
    }
}