use crate::core::GenomicRecordIterator;
use crate::error::{Error, Result};
use crate::io::Compression;
use flate2::read::MultiGzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone, PartialEq)]
pub struct BedRecord {
pub chrom: String,
pub start: u64,
pub end: u64,
pub name: Option<String>,
pub score: Option<f64>,
pub strand: Option<char>,
pub thick_start: Option<u64>,
pub thick_end: Option<u64>,
pub item_rgb: Option<String>,
pub block_count: Option<u32>,
pub block_sizes: Option<Vec<u32>>,
pub block_starts: Option<Vec<u32>>,
}
impl BedRecord {
pub fn len(&self) -> u64 {
self.end.saturating_sub(self.start)
}
pub fn is_empty(&self) -> bool {
self.start >= self.end
}
pub fn strand_str(&self) -> &str {
match self.strand {
Some('+') => "+",
Some('-') => "-",
_ => ".",
}
}
pub fn is_bed6(&self) -> bool {
self.name.is_some() && self.score.is_some() && self.strand.is_some()
}
pub fn is_bed12(&self) -> bool {
self.thick_start.is_some()
&& self.thick_end.is_some()
&& self.block_count.is_some()
}
}
pub struct BedReader {
reader: Box<dyn BufRead>,
line_buffer: String,
line_number: usize,
}
impl BedReader {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let compression = Compression::from_path(path);
let reader: Box<dyn BufRead> = match compression {
Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
_ => Box::new(BufReader::new(file)),
};
Ok(Self {
reader,
line_buffer: String::with_capacity(256),
line_number: 0,
})
}
pub fn new<R: BufRead + 'static>(reader: R) -> Self {
Self {
reader: Box::new(reader),
line_buffer: String::with_capacity(256),
line_number: 0,
}
}
fn parse_record(&mut self, line: &str) -> Result<BedRecord> {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 3 {
return Err(Error::Parse(format!(
"Line {}: BED record must have at least 3 fields (chrom, start, end), got {}",
self.line_number,
fields.len()
)));
}
let chrom = fields[0].to_string();
let start = fields[1].parse::<u64>().map_err(|_| {
Error::Parse(format!(
"Line {}: Invalid start position: {}",
self.line_number, fields[1]
))
})?;
let end = fields[2].parse::<u64>().map_err(|_| {
Error::Parse(format!(
"Line {}: Invalid end position: {}",
self.line_number, fields[2]
))
})?;
if start > end {
return Err(Error::Parse(format!(
"Line {}: Invalid interval: start ({}) > end ({})",
self.line_number, start, end
)));
}
let name = fields.get(3).map(|s| s.to_string());
let score = fields.get(4).and_then(|s| s.parse::<f64>().ok());
let strand = fields.get(5).and_then(|s| {
let c = s.chars().next()?;
if c == '+' || c == '-' || c == '.' {
Some(c)
} else {
None
}
});
let thick_start = fields.get(6).and_then(|s| s.parse::<u64>().ok());
let thick_end = fields.get(7).and_then(|s| s.parse::<u64>().ok());
let item_rgb = fields.get(8).map(|s| s.to_string());
let block_count = fields.get(9).and_then(|s| s.parse::<u32>().ok());
let block_sizes = fields.get(10).map(|s| {
s.trim_end_matches(',')
.split(',')
.filter_map(|x| x.parse::<u32>().ok())
.collect()
});
let block_starts = fields.get(11).map(|s| {
s.trim_end_matches(',')
.split(',')
.filter_map(|x| x.parse::<u32>().ok())
.collect()
});
Ok(BedRecord {
chrom,
start,
end,
name,
score,
strand,
thick_start,
thick_end,
item_rgb,
block_count,
block_sizes,
block_starts,
})
}
}
impl GenomicRecordIterator for BedReader {
type Record = BedRecord;
fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
loop {
self.line_buffer.clear();
let bytes = self.reader.read_line(&mut self.line_buffer)?;
if bytes == 0 {
return Ok(None);
}
self.line_number += 1;
let line = self.line_buffer.trim();
if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
continue;
}
return Ok(Some(line.as_bytes().to_vec()));
}
}
fn next_record(&mut self) -> Result<Option<Self::Record>> {
loop {
self.line_buffer.clear();
let bytes = self.reader.read_line(&mut self.line_buffer)?;
if bytes == 0 {
return Ok(None);
}
self.line_number += 1;
let line = self.line_buffer.trim().to_string();
if line.is_empty() || line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
continue;
}
return Ok(Some(self.parse_record(&line)?));
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bed3_parsing() {
let data = "chr1\t1000\t2000\n";
let mut reader = BedReader::new(data.as_bytes());
let record = reader.next_record().unwrap().unwrap();
assert_eq!(record.chrom, "chr1");
assert_eq!(record.start, 1000);
assert_eq!(record.end, 2000);
assert_eq!(record.len(), 1000);
assert!(record.name.is_none());
}
#[test]
fn test_bed6_parsing() {
let data = "chr1\t1000\t2000\tgene1\t100\t+\n";
let mut reader = BedReader::new(data.as_bytes());
let record = reader.next_record().unwrap().unwrap();
assert_eq!(record.chrom, "chr1");
assert_eq!(record.start, 1000);
assert_eq!(record.end, 2000);
assert_eq!(record.name, Some("gene1".to_string()));
assert_eq!(record.score, Some(100.0));
assert_eq!(record.strand, Some('+'));
assert!(record.is_bed6());
}
#[test]
fn test_bed12_parsing() {
let data = "chr1\t1000\t5000\tgene1\t100\t+\t1200\t4800\t255,0,0\t2\t1000,800\t0,3200\n";
let mut reader = BedReader::new(data.as_bytes());
let record = reader.next_record().unwrap().unwrap();
assert_eq!(record.chrom, "chr1");
assert_eq!(record.thick_start, Some(1200));
assert_eq!(record.thick_end, Some(4800));
assert_eq!(record.block_count, Some(2));
assert_eq!(record.block_sizes, Some(vec![1000, 800]));
assert_eq!(record.block_starts, Some(vec![0, 3200]));
assert!(record.is_bed12());
}
#[test]
fn test_skip_comments() {
let data = "# This is a comment\ntrack name=test\nchr1\t1000\t2000\n";
let mut reader = BedReader::new(data.as_bytes());
let record = reader.next_record().unwrap().unwrap();
assert_eq!(record.chrom, "chr1");
}
#[test]
fn test_invalid_interval() {
let data = "chr1\t2000\t1000\n"; let mut reader = BedReader::new(data.as_bytes());
assert!(reader.next_record().is_err());
}
#[test]
fn test_strand_str() {
let record = BedRecord {
chrom: "chr1".to_string(),
start: 1000,
end: 2000,
name: None,
score: None,
strand: Some('+'),
thick_start: None,
thick_end: None,
item_rgb: None,
block_count: None,
block_sizes: None,
block_starts: None,
};
assert_eq!(record.strand_str(), "+");
}
}