use super::myio;
use bio::io::bed;
use lazy_static::lazy_static;
use regex::Regex;
use std::cmp::{max, min};
use std::fmt;
use std::str;
// Regular expressions shared by this module; `lazy_static!` builds each one on first access.
lazy_static! {
// Tab-separated BED-style line: a non-whitespace field, two digit-only fields, then an
// optional tab and an optional fourth non-whitespace field; the trailing `.*` accepts
// whatever follows.
// NOTE(review): nothing in the code visible here references BED_RE — confirm it is still used.
static ref BED_RE: Regex = Regex::new(r"([^\s]+)\t([0-9]+)\t([0-9]+)\t?([^\s]+)?.*").unwrap();
// Region string of the form `name:start-end`. Exactly three capture groups:
// 1 = name, 2 = start digits, 3 = end digits. The pattern is not anchored.
static ref RGN_RE: Regex = Regex::new(r"(.+):([0-9]+)-([0-9]+)").unwrap();
}
/// A named interval, optionally carrying the BED record it was built from.
///
/// `Display` renders a region as `name:{st + 1}-{en}`.
#[derive(Default)]
pub struct Region {
/// Sequence name: the chromosome of a BED record, or the name part of a region string.
pub name: String,
/// Start coordinate. Stored one less than the start written in a region string and
/// printed as `st + 1` by `Display`.
pub st: u64,
/// End coordinate; printed unchanged by `Display`.
pub en: u64,
/// Identifier: the BED name field when one is present, otherwise a `name:start-end` string.
pub id: String,
/// Backing BED record. Left at its `Default` value when the region is not built from a
/// BED record (for example by `parse_region` and `split_region`).
pub record: bed::Record,
}
impl Region {
    /// Returns the text of the given 1-based column for this region.
    ///
    /// Columns 1 to 3 are taken from `name`, `st` and `en`. Columns 4 to 6 are the name,
    /// score and strand of the backing `record`. A missing name or score, and any column
    /// number outside 1 to 6, yields the placeholder `"no-value"`; a missing strand is
    /// rendered as `Strand::Unknown`.
    pub fn get_column(&self, column: u8) -> String {
        const MISSING: &str = "no-value";
        match column {
            1 => self.name.clone(),
            2 => self.st.to_string(),
            3 => self.en.to_string(),
            4 => String::from(self.record.name().unwrap_or(MISSING)),
            5 => String::from(self.record.score().unwrap_or(MISSING)),
            6 => {
                let strand = self
                    .record
                    .strand()
                    .unwrap_or(bio_types::strand::Strand::Unknown);
                strand.to_string()
            }
            _ => String::from(MISSING),
        }
    }
}
/// Formats a region as `name:start-end`, where the printed start is `st + 1` and the
/// printed end is `en` unchanged.
impl fmt::Display for Region {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let shown_start = self.st + 1;
        let shown_end = self.en;
        write!(f, "{}:{}-{}", self.name, shown_start, shown_end)
    }
}
/// Reports whether two regions share at least one position.
///
/// Regions with different names never overlap. For regions with the same name, the
/// intervals are compared with strict inequalities, so regions that merely touch
/// (`rgn1.en == rgn2.st`) do not count as overlapping.
pub fn has_overlap(rgn1: &Region, rgn2: &Region) -> bool {
    let same_sequence = rgn1.name == rgn2.name;
    same_sequence && rgn2.st < rgn1.en && rgn1.st < rgn2.en
}
/// Returns the length of the intersection of two regions.
///
/// The result is `min(en) - max(st)` for regions with the same name, and `0` when the
/// names differ or the intervals do not intersect.
pub fn get_overlap(rgn1: &Region, rgn2: &Region) -> u64 {
    if rgn1.name == rgn2.name {
        let shared_start = max(rgn1.st, rgn2.st);
        let shared_end = min(rgn1.en, rgn2.en);
        // Saturates to zero when the intervals are disjoint (shared_end < shared_start).
        shared_end.saturating_sub(shared_start)
    } else {
        0
    }
}
/// Parses a region string of the form `name:start-end` into a [`Region`].
///
/// The start written in the string is stored as `start - 1` in `st`; the end is stored
/// unchanged in `en`. The `record` field is left at its default value.
///
/// # Panics
///
/// Panics if `region` does not match `RGN_RE`, if the start does not fit in a `u64`, if
/// the start is `0` (subtracting one would underflow), or if the converted start is
/// greater than the end.
pub fn parse_region(region: &str) -> Region {
    let caps = RGN_RE
        .captures(region)
        .expect("Failed to parse region string.");
    let name = caps.get(1).unwrap().as_str().to_string();
    // A start of 0 used to underflow here: an arithmetic-overflow panic in debug builds and
    // a silent wrap to u64::MAX in release builds. Fail explicitly and consistently instead.
    let st = caps
        .get(2)
        .unwrap()
        .as_str()
        .parse::<u64>()
        .unwrap()
        .checked_sub(1)
        .expect("Region start is 1-based and must be at least 1.");
    // The end group is digits only, so parsing fails only when the value does not fit in a
    // `u64`; in that case the end falls back to 4294967295.
    let en = caps
        .get(3)
        .unwrap()
        .as_str()
        .parse::<u64>()
        .unwrap_or(4294967295);
    // Use capture group 4 as the id when the pattern provides one; otherwise build the
    // default `name:start-end` id lazily. RGN_RE as defined in this file has only three
    // groups, so the default is what is currently always used.
    let id = caps.get(4).map_or_else(
        || format!("{}:{}-{}", name, st + 1, en),
        |m| m.as_str().to_string(),
    );
    assert!(
        st <= en,
        "Region start must be less than end.\n{}",
        region
    );
    Region {
        name,
        st,
        en,
        id,
        ..Default::default()
    }
}
/// Parses the first BED record found in `rec` and converts it into a [`Region`].
///
/// # Panics
///
/// Panics if `rec` yields no record or if the first record cannot be read.
pub fn parse_bed_rec(rec: &str) -> Region {
    let first_record = bed::Reader::new(rec.as_bytes())
        .records()
        .next()
        .unwrap()
        .unwrap();
    parse_bed_record(first_record)
}
/// Builds a [`Region`] from a BED record, taking ownership of the record.
///
/// `name`, `st` and `en` are copied from the record's chromosome, start and end. The `id`
/// is the record's name field when present, otherwise a `name:{st + 1}-{en}` string. The
/// record itself is stored in the returned region.
pub fn parse_bed_record(record: bed::Record) -> Region {
    let name = record.chrom().to_owned();
    let (st, en) = (record.start(), record.end());
    // Only format the fallback id when the record carries no name.
    let id = record
        .name()
        .map_or_else(|| format!("{}:{}-{}", name, st + 1, en), str::to_owned);
    Region {
        name,
        st,
        en,
        id,
        record,
    }
}
/// Reads every BED record from `filename` (opened through `myio::reader`) into a vector
/// of [`Region`]s.
///
/// Records that fail to parse are reported with `log::warn!` and skipped; the remaining
/// records are returned in file order. A `log::debug!` line is emitted for every record
/// read, whether it parsed or not.
pub fn parse_bed(filename: &str) -> Vec<Region> {
    let mut regions = Vec::new();
    let mut bed_reader = bed::Reader::new(myio::reader(filename));
    for (idx, parsed) in bed_reader.records().enumerate() {
        // One-based record number. Derived from the iterator index (`usize`) rather than a
        // separately maintained counter, which was inferred as `i32` and could overflow.
        let rec_num = idx + 1;
        match parsed {
            Ok(record) => regions.push(parse_bed_record(record)),
            Err(e) => log::warn!(
                "Unable to parse bed at line {}, skipping. Reason: {}",
                rec_num,
                e
            ),
        }
        log::debug!("Read bed record number {}", rec_num);
    }
    regions
}
/// Splits `rgn` into consecutive pieces of at most `window` positions each.
///
/// The pieces are returned in order, cover `rgn.st..rgn.en` without gaps or overlap, and
/// each one copies the `name` and `id` of `rgn` (other fields are left at their defaults).
/// The last piece is shorter when the region length is not a multiple of `window`. A
/// region with `st >= en` yields an empty vector.
///
/// # Panics
///
/// Panics if `window` is `0` and the region is non-empty, because the split could never
/// make progress.
pub fn split_region(rgn: &Region, window: u64) -> Vec<Region> {
    let mut small_rgns = Vec::new();
    if rgn.st >= rgn.en {
        return small_rgns;
    }
    // A zero window would leave `start` unchanged on every iteration: an endless loop that
    // keeps pushing regions until memory runs out.
    assert!(
        window > 0,
        "split_region: window must be greater than zero."
    );
    let mut start = rgn.st;
    while start < rgn.en {
        // Saturate so a huge window cannot overflow; the piece is clamped to the region end.
        let end = start.saturating_add(window).min(rgn.en);
        small_rgns.push(Region {
            name: rgn.name.clone(),
            st: start,
            en: end,
            id: rgn.id.clone(),
            ..Default::default()
        });
        start = end;
    }
    small_rgns
}