use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use cyanea_core::{CyaneaError, Result};
use cyanea_omics::genomic::{GenomicInterval, Strand};
#[derive(Debug, Clone)]
pub struct BedRecord {
pub interval: GenomicInterval,
pub name: Option<String>,
pub score: Option<u32>,
}
pub fn parse_bed(path: impl AsRef<Path>) -> Result<Vec<BedRecord>> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let reader = BufReader::new(file);
let mut data_lines: Vec<(usize, String)> = Vec::new();
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: line {}: {}", path.display(), line_num + 1, e),
))
})?;
let trimmed = line.trim().to_string();
if trimmed.is_empty()
|| trimmed.starts_with('#')
|| trimmed.starts_with("track")
|| trimmed.starts_with("browser")
{
continue;
}
data_lines.push((line_num + 1, trimmed));
}
#[cfg(feature = "parallel")]
{
use rayon::prelude::*;
data_lines
.par_iter()
.map(|(line_num, line)| parse_bed_line(line, *line_num, path))
.collect()
}
#[cfg(not(feature = "parallel"))]
data_lines
.iter()
.map(|(line_num, line)| parse_bed_line(line, *line_num, path))
.collect()
}
pub fn parse_bed_intervals(path: impl AsRef<Path>) -> Result<Vec<GenomicInterval>> {
parse_bed(path).map(|records| records.into_iter().map(|r| r.interval).collect())
}
#[derive(Debug, Clone)]
pub struct BedStats {
pub record_count: u64,
pub total_bases: u64,
pub chromosomes: Vec<String>,
}
pub fn bed_stats(path: impl AsRef<Path>) -> Result<BedStats> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let reader = BufReader::new(file);
let mut record_count: u64 = 0;
let mut total_bases: u64 = 0;
let mut chroms = Vec::new();
let mut seen_chroms = std::collections::HashSet::new();
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(CyaneaError::Io)?;
let line = line.trim();
if line.is_empty()
|| line.starts_with('#')
|| line.starts_with("track")
|| line.starts_with("browser")
{
continue;
}
let record = parse_bed_line(line, line_num + 1, path)?;
record_count += 1;
total_bases += record.interval.len();
if seen_chroms.insert(record.interval.chrom.clone()) {
chroms.push(record.interval.chrom);
}
}
Ok(BedStats {
record_count,
total_bases,
chromosomes: chroms,
})
}
pub fn parse_bed_str(text: &str) -> Result<Vec<BedRecord>> {
let dummy = Path::new("<string>");
text.lines()
.enumerate()
.filter(|(_, line)| {
let t = line.trim();
!t.is_empty()
&& !t.starts_with('#')
&& !t.starts_with("track")
&& !t.starts_with("browser")
})
.map(|(i, line)| parse_bed_line(line.trim(), i + 1, dummy))
.collect()
}
fn parse_bed_line(line: &str, line_num: usize, path: &Path) -> Result<BedRecord> {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 3 {
return Err(CyaneaError::Parse(format!(
"{}: line {}: expected at least 3 tab-separated columns, found {}",
path.display(),
line_num,
fields.len()
)));
}
let chrom = fields[0].to_string();
let start: u64 = fields[1].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid start coordinate '{}'",
path.display(),
line_num,
fields[1]
))
})?;
let end: u64 = fields[2].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid end coordinate '{}'",
path.display(),
line_num,
fields[2]
))
})?;
if start > end {
return Err(CyaneaError::Parse(format!(
"{}: line {}: start ({}) > end ({})",
path.display(),
line_num,
start,
end
)));
}
let name = fields.get(3).and_then(|s| {
if *s == "." { None } else { Some(s.to_string()) }
});
let score = fields.get(4).and_then(|s| {
if *s == "." { None } else { s.parse().ok() }
});
let strand = fields.get(5).map_or(Strand::Unknown, |s| match *s {
"+" => Strand::Forward,
"-" => Strand::Reverse,
_ => Strand::Unknown,
});
let interval = GenomicInterval::with_strand(chrom, start, end, strand)?;
Ok(BedRecord {
interval,
name,
score,
})
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn write_bed(content: &str) -> NamedTempFile {
let mut file = NamedTempFile::with_suffix(".bed").unwrap();
write!(file, "{}", content).unwrap();
file.flush().unwrap();
file
}
#[test]
fn test_parse_bed3() {
let file = write_bed("chr1\t100\t200\nchr1\t300\t400\nchr2\t500\t600\n");
let records = parse_bed(file.path()).unwrap();
assert_eq!(records.len(), 3);
assert_eq!(records[0].interval.chrom, "chr1");
assert_eq!(records[0].interval.start, 100);
assert_eq!(records[0].interval.end, 200);
assert_eq!(records[0].interval.strand, Strand::Unknown);
assert!(records[0].name.is_none());
assert!(records[0].score.is_none());
}
#[test]
fn test_parse_bed6() {
let file = write_bed("chr1\t100\t200\tgene1\t500\t+\nchr1\t300\t400\tgene2\t0\t-\n");
let records = parse_bed(file.path()).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].name, Some("gene1".to_string()));
assert_eq!(records[0].score, Some(500));
assert_eq!(records[0].interval.strand, Strand::Forward);
assert_eq!(records[1].interval.strand, Strand::Reverse);
}
#[test]
fn test_parse_bed_intervals() {
let file = write_bed("chr1\t0\t100\nchr1\t200\t300\n");
let intervals = parse_bed_intervals(file.path()).unwrap();
assert_eq!(intervals.len(), 2);
assert_eq!(intervals[0].len(), 100);
assert_eq!(intervals[1].start, 200);
}
#[test]
fn test_bed_stats() {
let file = write_bed("chr1\t0\t100\nchr1\t200\t350\nchr2\t0\t50\n");
let stats = bed_stats(file.path()).unwrap();
assert_eq!(stats.record_count, 3);
assert_eq!(stats.total_bases, 300); assert_eq!(stats.chromosomes, vec!["chr1", "chr2"]);
}
#[test]
fn test_bed_skip_headers() {
let file = write_bed(
"# comment\n\
track name=test\n\
browser position chr1:1-1000\n\
chr1\t100\t200\n",
);
let records = parse_bed(file.path()).unwrap();
assert_eq!(records.len(), 1);
}
#[test]
fn test_bed_empty_file() {
let file = write_bed("# header only\n");
let records = parse_bed(file.path()).unwrap();
assert!(records.is_empty());
}
#[test]
fn test_bed_invalid_start_end() {
let file = write_bed("chr1\t200\t100\n");
let result = parse_bed(file.path());
assert!(result.is_err());
}
#[test]
fn test_bed_too_few_columns() {
let file = write_bed("chr1\t100\n");
let result = parse_bed(file.path());
assert!(result.is_err());
}
#[test]
fn test_parse_bed_str() {
let text = "chr1\t100\t200\tgene1\t500\t+\nchr2\t300\t400\n";
let records = super::parse_bed_str(text).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].interval.chrom, "chr1");
assert_eq!(records[0].name, Some("gene1".to_string()));
assert_eq!(records[1].interval.chrom, "chr2");
}
#[test]
fn test_bed_file_not_found() {
let result = parse_bed("/nonexistent/file.bed");
assert!(result.is_err());
}
}