Skip to main content

genomicframe_core/formats/bed/
reader.rs

1//! BED (Browser Extensible Data) format reader
2//!
3//! BED format represents genomic intervals/regions in a tab-delimited text format.
4//! Supports BED3 (minimal), BED6 (standard), and BED12 (full) formats.
5//!
6//! # Design Philosophy
7//!
8//! - **Streaming by default**: Records are never buffered unless explicitly requested
9//! - **O(1) memory**: Process millions of regions with constant memory
10//! - **Lazy evaluation**: Parse regions on-demand as you iterate
11//! - **Flexible parsing**: Auto-detect BED3/BED6/BED12 format
12//!
13//! # BED Format Specification
14//!
15//! ## BED3 (Minimal - required fields):
16//! 1. chrom: Chromosome name (e.g., chr1, chrX)
17//! 2. chromStart: Start position (0-based, inclusive)
18//! 3. chromEnd: End position (0-based, exclusive)
19//!
20//! ## BED6 (Standard - adds optional fields):
21//! 4. name: Feature name/ID
22//! 5. score: Score (0-1000)
23//! 6. strand: Strand (+, -, or .)
24//!
25//! ## BED12 (Full - adds block/exon structure):
26//! 7. thickStart: Coding region start
27//! 8. thickEnd: Coding region end
28//! 9. itemRgb: RGB color (e.g., "255,0,0")
29//! 10. blockCount: Number of blocks/exons
30//! 11. blockSizes: Comma-separated block sizes
31//! 12. blockStarts: Comma-separated block starts (relative to chromStart)
32//!
33//! # Examples
34//!
35//! ```no_run
36//! use genomicframe_core::formats::bed::BedReader;
37//! use genomicframe_core::core::GenomicRecordIterator;
38//!
39//! // Streaming: O(1) memory, processes one region at a time
40//! let mut reader = BedReader::from_path("regions.bed")?;
41//!
42//! // Iterate through regions
43//! while let Some(record) = reader.next_record()? {
44//!     println!("{}: {}-{}", record.chrom, record.start, record.end);
45//!
46//!     if let Some(name) = &record.name {
47//!         println!("  Name: {}", name);
48//!     }
49//! }
50//! # Ok::<(), genomicframe_core::error::Error>(())
51//! ```
52
53use crate::core::GenomicRecordIterator;
54use crate::error::{Error, Result};
55use crate::io::Compression;
56use flate2::read::MultiGzDecoder;
57use std::fs::File;
58use std::io::{BufRead, BufReader};
59use std::path::Path;
60
61/// BED record (genomic interval)
62///
63/// Supports BED3, BED6, and BED12 formats through optional fields.
64#[derive(Debug, Clone, PartialEq)]
65pub struct BedRecord {
66    /// Chromosome name (required)
67    pub chrom: String,
68
69    /// Start position, 0-based inclusive (required)
70    pub start: u64,
71
72    /// End position, 0-based exclusive (required)
73    pub end: u64,
74
75    /// Feature name (BED6+)
76    pub name: Option<String>,
77
78    /// Score, typically 0-1000 (BED6+)
79    pub score: Option<f64>,
80
81    /// Strand: +, -, or . (BED6+)
82    pub strand: Option<char>,
83
84    /// Coding region start (BED12)
85    pub thick_start: Option<u64>,
86
87    /// Coding region end (BED12)
88    pub thick_end: Option<u64>,
89
90    /// RGB color string (BED12)
91    pub item_rgb: Option<String>,
92
93    /// Number of blocks/exons (BED12)
94    pub block_count: Option<u32>,
95
96    /// Comma-separated block sizes (BED12)
97    pub block_sizes: Option<Vec<u32>>,
98
99    /// Comma-separated block starts, relative to chromStart (BED12)
100    pub block_starts: Option<Vec<u32>>,
101}
102
103impl BedRecord {
104    /// Get the length of this interval
105    pub fn len(&self) -> u64 {
106        self.end.saturating_sub(self.start)
107    }
108
109    /// Check if this is an empty interval (start == end)
110    pub fn is_empty(&self) -> bool {
111        self.start >= self.end
112    }
113
114    /// Get the strand as a string ("+", "-", or ".")
115    pub fn strand_str(&self) -> &str {
116        match self.strand {
117            Some('+') => "+",
118            Some('-') => "-",
119            _ => ".",
120        }
121    }
122
123    /// Check if this record has BED6+ fields
124    pub fn is_bed6(&self) -> bool {
125        self.name.is_some() && self.score.is_some() && self.strand.is_some()
126    }
127
128    /// Check if this record has BED12 fields
129    pub fn is_bed12(&self) -> bool {
130        self.thick_start.is_some()
131            && self.thick_end.is_some()
132            && self.block_count.is_some()
133    }
134}
135
136/// BED file reader (streaming, memory-efficient)
137pub struct BedReader {
138    reader: Box<dyn BufRead>,
139    line_buffer: String,
140    line_number: usize,
141}
142
143impl BedReader {
144    /// Open a BED file from a path - handles .bed and .bed.gz automatically
145    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
146        let path = path.as_ref();
147        let file = File::open(path)?;
148        let compression = Compression::from_path(path);
149
150        let reader: Box<dyn BufRead> = match compression {
151            Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
152            _ => Box::new(BufReader::new(file)),
153        };
154
155        Ok(Self {
156            reader,
157            line_buffer: String::with_capacity(256),
158            line_number: 0,
159        })
160    }
161
162    /// Create a new BED reader from any BufRead source
163    pub fn new<R: BufRead + 'static>(reader: R) -> Self {
164        Self {
165            reader: Box::new(reader),
166            line_buffer: String::with_capacity(256),
167            line_number: 0,
168        }
169    }
170
171    /// Parse a BED record line
172    fn parse_record(&mut self, line: &str) -> Result<BedRecord> {
173        let fields: Vec<&str> = line.split('\t').collect();
174
175        if fields.len() < 3 {
176            return Err(Error::Parse(format!(
177                "Line {}: BED record must have at least 3 fields (chrom, start, end), got {}",
178                self.line_number,
179                fields.len()
180            )));
181        }
182
183        // Required fields (BED3)
184        let chrom = fields[0].to_string();
185        let start = fields[1].parse::<u64>().map_err(|_| {
186            Error::Parse(format!(
187                "Line {}: Invalid start position: {}",
188                self.line_number, fields[1]
189            ))
190        })?;
191        let end = fields[2].parse::<u64>().map_err(|_| {
192            Error::Parse(format!(
193                "Line {}: Invalid end position: {}",
194                self.line_number, fields[2]
195            ))
196        })?;
197
198        // Validate interval
199        if start > end {
200            return Err(Error::Parse(format!(
201                "Line {}: Invalid interval: start ({}) > end ({})",
202                self.line_number, start, end
203            )));
204        }
205
206        // Optional fields (BED6)
207        let name = fields.get(3).map(|s| s.to_string());
208        let score = fields.get(4).and_then(|s| s.parse::<f64>().ok());
209        let strand = fields.get(5).and_then(|s| {
210            let c = s.chars().next()?;
211            if c == '+' || c == '-' || c == '.' {
212                Some(c)
213            } else {
214                None
215            }
216        });
217
218        // Optional fields (BED12)
219        let thick_start = fields.get(6).and_then(|s| s.parse::<u64>().ok());
220        let thick_end = fields.get(7).and_then(|s| s.parse::<u64>().ok());
221        let item_rgb = fields.get(8).map(|s| s.to_string());
222        let block_count = fields.get(9).and_then(|s| s.parse::<u32>().ok());
223
224        let block_sizes = fields.get(10).map(|s| {
225            s.trim_end_matches(',')
226                .split(',')
227                .filter_map(|x| x.parse::<u32>().ok())
228                .collect()
229        });
230
231        let block_starts = fields.get(11).map(|s| {
232            s.trim_end_matches(',')
233                .split(',')
234                .filter_map(|x| x.parse::<u32>().ok())
235                .collect()
236        });
237
238        Ok(BedRecord {
239            chrom,
240            start,
241            end,
242            name,
243            score,
244            strand,
245            thick_start,
246            thick_end,
247            item_rgb,
248            block_count,
249            block_sizes,
250            block_starts,
251        })
252    }
253}
254
255impl GenomicRecordIterator for BedReader {
256    type Record = BedRecord;
257
258    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
259        loop {
260            self.line_buffer.clear();
261            let bytes = self.reader.read_line(&mut self.line_buffer)?;
262
263            if bytes == 0 {
264                return Ok(None);
265            }
266
267            self.line_number += 1;
268            let line = self.line_buffer.trim();
269
270            // Skip empty lines and comments
271            if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
272                continue;
273            }
274
275            return Ok(Some(line.as_bytes().to_vec()));
276        }
277    }
278
279    fn next_record(&mut self) -> Result<Option<Self::Record>> {
280        loop {
281            self.line_buffer.clear();
282            let bytes = self.reader.read_line(&mut self.line_buffer)?;
283
284            if bytes == 0 {
285                return Ok(None);
286            }
287
288            self.line_number += 1;
289            let line = self.line_buffer.trim().to_string();
290
291            // Skip empty lines and comments
292            if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
293                continue;
294            }
295
296            return Ok(Some(self.parse_record(&line)?));
297        }
298    }
299}
300
301#[cfg(test)]
302mod tests {
303    use super::*;
304
305    #[test]
306    fn test_bed3_parsing() {
307        let data = "chr1\t1000\t2000\n";
308        let mut reader = BedReader::new(data.as_bytes());
309
310        let record = reader.next_record().unwrap().unwrap();
311        assert_eq!(record.chrom, "chr1");
312        assert_eq!(record.start, 1000);
313        assert_eq!(record.end, 2000);
314        assert_eq!(record.len(), 1000);
315        assert!(record.name.is_none());
316    }
317
318    #[test]
319    fn test_bed6_parsing() {
320        let data = "chr1\t1000\t2000\tgene1\t100\t+\n";
321        let mut reader = BedReader::new(data.as_bytes());
322
323        let record = reader.next_record().unwrap().unwrap();
324        assert_eq!(record.chrom, "chr1");
325        assert_eq!(record.start, 1000);
326        assert_eq!(record.end, 2000);
327        assert_eq!(record.name, Some("gene1".to_string()));
328        assert_eq!(record.score, Some(100.0));
329        assert_eq!(record.strand, Some('+'));
330        assert!(record.is_bed6());
331    }
332
333    #[test]
334    fn test_bed12_parsing() {
335        let data = "chr1\t1000\t5000\tgene1\t100\t+\t1200\t4800\t255,0,0\t2\t1000,800\t0,3200\n";
336        let mut reader = BedReader::new(data.as_bytes());
337
338        let record = reader.next_record().unwrap().unwrap();
339        assert_eq!(record.chrom, "chr1");
340        assert_eq!(record.thick_start, Some(1200));
341        assert_eq!(record.thick_end, Some(4800));
342        assert_eq!(record.block_count, Some(2));
343        assert_eq!(record.block_sizes, Some(vec![1000, 800]));
344        assert_eq!(record.block_starts, Some(vec![0, 3200]));
345        assert!(record.is_bed12());
346    }
347
348    #[test]
349    fn test_skip_comments() {
350        let data = "# This is a comment\ntrack name=test\nchr1\t1000\t2000\n";
351        let mut reader = BedReader::new(data.as_bytes());
352
353        let record = reader.next_record().unwrap().unwrap();
354        assert_eq!(record.chrom, "chr1");
355    }
356
357    #[test]
358    fn test_invalid_interval() {
359        let data = "chr1\t2000\t1000\n"; // start > end
360        let mut reader = BedReader::new(data.as_bytes());
361
362        assert!(reader.next_record().is_err());
363    }
364
365    #[test]
366    fn test_strand_str() {
367        let record = BedRecord {
368            chrom: "chr1".to_string(),
369            start: 1000,
370            end: 2000,
371            name: None,
372            score: None,
373            strand: Some('+'),
374            thick_start: None,
375            thick_end: None,
376            item_rgb: None,
377            block_count: None,
378            block_sizes: None,
379            block_starts: None,
380        };
381
382        assert_eq!(record.strand_str(), "+");
383    }
384}