use crate::util::{get_buf, Strand};
use math::{
partition::integer_interval_map::IntegerIntervalMap,
set::{
contiguous_integer_set::ContiguousIntegerSet,
ordered_integer_set::OrderedIntegerSet,
},
traits::ToIterator,
};
use num::Float;
use std::{
collections::HashMap,
fmt::Debug,
fs::File,
io::{BufRead, BufReader},
marker::PhantomData,
str::FromStr,
};
pub mod bed_writer;
pub mod paired_end_collator;
use crate::iter::{ChromIntervalValue, ToChromIntervalValueIter};
pub use bed_writer::BedWriter;
pub struct Bed {
filepath: String,
binarize_score: bool,
}
impl Bed {
pub fn new(filepath: &str, use_binary_score: bool) -> Bed {
Bed {
filepath: filepath.to_string(),
binarize_score: use_binary_score,
}
}
#[inline]
pub fn get_filepath(&self) -> &str {
&self.filepath
}
pub fn get_chrom_to_intervals(
&self,
) -> HashMap<Chrom, OrderedIntegerSet<Coordinate>> {
let mut chrom_to_interval_map = HashMap::new();
for (chrom, start, end) in self.to_coord_iter() {
let interval_map = chrom_to_interval_map
.entry(chrom)
.or_insert_with(IntegerIntervalMap::new);
interval_map
.aggregate(ContiguousIntegerSet::new(start, end - 1), 1);
}
chrom_to_interval_map
.into_iter()
.map(|(chrom, interval_map)| {
let intervals: Vec<ContiguousIntegerSet<Coordinate>> =
interval_map
.into_map()
.into_iter()
.map(|(k, _)| k)
.collect();
(chrom, OrderedIntegerSet::from(intervals))
})
.collect()
}
pub fn to_coord_iter(&self) -> BedCoordinateIter {
BedCoordinateIter {
buf: get_buf(&self.filepath).unwrap(),
filename: self.filepath.clone(),
}
}
}
impl<D, E>
ToIterator<'_, BedDataLineIter<D>, <BedDataLineIter<D> as Iterator>::Item>
for Bed
where
D: Float + FromStr<Err = E>,
E: Debug,
{
fn to_iter(&self) -> BedDataLineIter<D> {
BedDataLineIter {
buf: get_buf(&self.filepath).unwrap(),
filename: self.filepath.clone(),
binarize_score: self.binarize_score,
phantom: PhantomData,
}
}
}
impl<V: Float + FromStr<Err = E>, E: Debug>
ToChromIntervalValueIter<BedDataLineIter<V>, BedDataLine<V>, Coordinate, V>
for Bed
{
fn to_chrom_interval_value_iter(&self) -> BedDataLineIter<V> {
self.to_iter()
}
}
impl<V: Copy + Float + FromStr<Err = E>, E: Debug>
ChromIntervalValue<Coordinate, V> for BedDataLine<V>
{
fn chrom_interval_value(
&self,
) -> (Chrom, ContiguousIntegerSet<Coordinate>, V) {
let score = self.score.expect("BED file is missing the score field");
(
self.chrom.clone(),
ContiguousIntegerSet::new(self.start, self.end - 1),
score,
)
}
}
pub type Coordinate = i64;
pub type Chrom = String;
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct BedDataLine<D> {
pub chrom: Chrom,
pub start: Coordinate,
pub end: Coordinate,
pub name: Option<String>,
pub score: Option<D>,
pub strand: Option<Strand>,
}
pub struct BedDataLineIter<D> {
buf: BufReader<File>,
filename: String,
binarize_score: bool,
phantom: PhantomData<D>,
}
impl<D> BedDataLineIter<D> {
pub fn get_filename(&self) -> &str {
&self.filename
}
}
impl<D: Float + FromStr<Err = E>, E: Debug> Iterator for BedDataLineIter<D> {
type Item = BedDataLine<D>;
fn next(&mut self) -> Option<Self::Item> {
loop {
let mut line = String::new();
return if self.buf.read_line(&mut line).unwrap() == 0 {
None
} else {
let mut toks = line.split_whitespace();
let chrom = {
let chrom = toks.next().unwrap();
if chrom.starts_with('#') || chrom == "track" {
continue;
}
chrom.to_string()
};
let start = toks.next().unwrap().parse::<Coordinate>().unwrap();
let end = toks.next().unwrap().parse::<Coordinate>().unwrap();
let name = toks.next().map(|name| name.to_string());
let score = if self.binarize_score {
toks.next();
Some(D::one())
} else {
toks.next().map(|score| score.parse::<D>().unwrap())
};
let strand = toks.next().and_then(|strand| {
Strand::new(strand)
.expect("failed to parse the strand symbol")
});
Some(BedDataLine {
chrom,
start,
end,
name,
score,
strand,
})
};
}
}
}
pub struct BedCoordinateIter {
buf: BufReader<File>,
filename: String,
}
impl BedCoordinateIter {
pub fn get_filename(&self) -> &str {
&self.filename
}
}
impl Iterator for BedCoordinateIter {
type Item = (Chrom, Coordinate, Coordinate);
fn next(&mut self) -> Option<Self::Item> {
loop {
let mut line = String::new();
return if self.buf.read_line(&mut line).unwrap() == 0 {
None
} else {
let mut toks = line.split_whitespace();
let chrom = {
let chrom = toks.next().unwrap();
if chrom.starts_with('#') || chrom == "track" {
continue;
}
chrom.to_string()
};
let start = toks.next().unwrap().parse::<Coordinate>().unwrap();
let end = toks.next().unwrap().parse::<Coordinate>().unwrap();
Some((chrom, start, end))
};
}
}
}
#[cfg(test)]
mod tests {
use crate::{
bed::{Bed, Chrom, Coordinate},
iter::{ChromIntervalValue, ToChromIntervalValueIter},
};
use math::{
partition::integer_interval_map::IntegerIntervalMap,
set::{
contiguous_integer_set::ContiguousIntegerSet,
ordered_integer_set::OrderedIntegerSet,
},
};
use std::{
collections::HashMap,
io::{BufWriter, Write},
};
use tempfile::NamedTempFile;
#[test]
fn test_get_chrom_to_interval_to_val() {
let file = NamedTempFile::new().unwrap();
{
let mut writer = BufWriter::new(&file);
writer
.write_fmt(format_args!(
"chr1 100 200 name_1 3.5\n\
chr1 150 250 name_2 2\n\
chr1 200 350 name_3 4.0\n\
chr3 1000 3000 name_4 -0.3\n\
chr1 400 450 name_5 -0.9\n\
chr3 2500 3000 name_6 0.3\n"
))
.unwrap();
}
let bed = Bed::new(file.path().to_str().unwrap(), false);
{
let chrom_to_interval_to_val =
ToChromIntervalValueIter::get_chrom_to_interval_to_val(
&bed, None,
)
.unwrap();
{
let mut expected_chr1 = IntegerIntervalMap::<f64>::new();
expected_chr1
.aggregate(ContiguousIntegerSet::new(100, 149), 3.5);
expected_chr1
.aggregate(ContiguousIntegerSet::new(150, 199), 5.5);
expected_chr1
.aggregate(ContiguousIntegerSet::new(200, 249), 6.);
expected_chr1
.aggregate(ContiguousIntegerSet::new(250, 349), 4.);
expected_chr1
.aggregate(ContiguousIntegerSet::new(400, 449), -0.9);
assert_eq!(chrom_to_interval_to_val["chr1"], expected_chr1);
}
{
let mut expected_chr3 = IntegerIntervalMap::<f64>::new();
expected_chr3
.aggregate(ContiguousIntegerSet::new(1000, 2499), -0.3);
expected_chr3
.aggregate(ContiguousIntegerSet::new(2500, 2999), 0.);
assert_eq!(chrom_to_interval_to_val["chr3"], expected_chr3);
}
}
{
let exclude: HashMap<Chrom, OrderedIntegerSet<Coordinate>> = [
(
"chr1".into(),
OrderedIntegerSet::from_slice(&[[80, 100], [190, 220]]),
),
(
"chr3".into(),
OrderedIntegerSet::from_slice(&[[1010, 1020]]),
),
]
.iter()
.cloned()
.collect();
let chrom_to_interval_to_val =
ToChromIntervalValueIter::get_chrom_to_interval_to_val(
&bed,
Some(exclude).as_ref(),
)
.unwrap();
{
let mut expected_chr1 = IntegerIntervalMap::<f64>::new();
expected_chr1
.aggregate(ContiguousIntegerSet::new(400, 449), -0.9);
assert_eq!(chrom_to_interval_to_val["chr1"], expected_chr1);
}
{
let mut expected_chr3 = IntegerIntervalMap::<f64>::new();
expected_chr3
.aggregate(ContiguousIntegerSet::new(2500, 2999), 0.3);
assert_eq!(chrom_to_interval_to_val["chr3"], expected_chr3);
}
}
}
#[test]
fn test_get_chrom_to_intervals() {
let file = NamedTempFile::new().unwrap();
{
let mut writer = BufWriter::new(&file);
writer
.write_fmt(format_args!(
"chr1 100 200 name_1 3.5\n\
chr1 150 250 name_2 2\n\
chr1 200 350 name_3 4.0\n\
chr3 1000 3000 name_4 -0.3\n\
chr1 400 450 name_5 -0.9\n\
chr3 2500 3000 name_6 0.3\n"
))
.unwrap();
}
let bed = Bed::new(file.path().to_str().unwrap(), false);
let chrom_to_intervals = bed.get_chrom_to_intervals();
let expected: HashMap<Chrom, OrderedIntegerSet<Coordinate>> = [
(
"chr1".into(),
OrderedIntegerSet::from_slice(&[[100, 349], [400, 449]]),
),
(
"chr3".into(),
OrderedIntegerSet::from_slice(&[[1000, 2999]]),
),
]
.iter()
.cloned()
.collect();
assert_eq!(chrom_to_intervals, expected);
}
#[test]
fn test_bed_to_chrom_interval_value_iter() {
let file = NamedTempFile::new().unwrap();
{
let mut writer = BufWriter::new(&file);
writer
.write_fmt(format_args!(
"chr1 100 200 name_1 3.5\n\
chr1 150 250 name_2 2\n\
chr1 200 350 name_3 4.0\n\
chr3 1000 3000 name_4 -0.3\n\
chr1 400 450 name_5 -0.9\n\
chr3 2500 3000 name_6 0.3\n"
))
.unwrap();
}
let bed = Bed::new(file.path().to_str().unwrap(), false);
let mut iter = bed.to_chrom_interval_value_iter();
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
("chr1".to_string(), ContiguousIntegerSet::new(100, 199), 3.5)
);
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
("chr1".to_string(), ContiguousIntegerSet::new(150, 249), 2.)
);
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
("chr1".to_string(), ContiguousIntegerSet::new(200, 349), 4.)
);
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
(
"chr3".to_string(),
ContiguousIntegerSet::new(1000, 2999),
-0.3
)
);
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
(
"chr1".to_string(),
ContiguousIntegerSet::new(400, 449),
-0.9
)
);
assert_eq!(
iter.next().unwrap().chrom_interval_value(),
(
"chr3".to_string(),
ContiguousIntegerSet::new(2500, 2999),
0.3
)
);
assert_eq!(iter.next(), None);
}
}