use std::{
fmt::Display,
mem,
num::TryFromIntError,
ops::{Add, AddAssign, Sub, SubAssign},
sync::LazyLock,
};
use bstr::ByteSlice;
use rust_htslib::bam::{self, ext::BamRecordExtensions};
use serde::{Deserialize, Serialize};
use crate::common::contig::ContigName;
#[derive(Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
pub struct GenomePosition {
contig: ContigName,
position: usize,
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Hash, Deserialize, Serialize)]
pub struct GenomeRegion {
start: GenomePosition,
len: Option<usize>,
}
impl GenomePosition {
pub fn new_0(contig: ContigName, position: usize) -> Self {
Self { contig, position }
}
pub fn new_1(contig: ContigName, position: usize) -> Self {
Self {
contig,
position: position - 1,
}
}
pub fn contig(&self) -> &ContigName {
&self.contig
}
pub fn position_0(&self) -> usize {
self.position
}
pub fn abs_diff(&self, other: &Self) -> usize {
self.position.abs_diff(other.position)
}
}
impl PartialOrd for GenomePosition {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.contig
.eq(&other.contig)
.then_some(self.position.cmp(&other.position))
}
}
impl AddAssign<usize> for GenomePosition {
fn add_assign(&mut self, rhs: usize) {
self.position += rhs;
}
}
impl Add<usize> for GenomePosition {
type Output = GenomePosition;
fn add(mut self, rhs: usize) -> Self::Output {
self += rhs;
self
}
}
impl SubAssign<usize> for GenomePosition {
fn sub_assign(&mut self, rhs: usize) {
self.position -= rhs;
}
}
impl Sub<usize> for GenomePosition {
type Output = GenomePosition;
fn sub(mut self, rhs: usize) -> Self::Output {
self -= rhs;
self
}
}
#[allow(dead_code)]
impl GenomeRegion {
pub fn new_bounded(start: GenomePosition, len: usize) -> Self {
Self {
start,
len: Some(len),
}
}
pub fn new_unbounded(start: GenomePosition) -> Self {
Self { start, len: None }
}
pub fn contig(&self) -> &ContigName {
self.start.contig()
}
pub fn from_incl_incl(
start: GenomePosition,
end: Option<GenomePosition>,
) -> anyhow::Result<Self> {
Self::from_incl_excl(start, end.map(|end| end + 1))
}
pub fn from_incl_excl(
start: GenomePosition,
end: Option<GenomePosition>,
) -> anyhow::Result<Self> {
if let Some(end) = &end {
if start.contig() != end.contig() {
anyhow::bail!("Cannot create GenomeRegion spanning multiple contigs");
}
if start.position_0() > end.position_0() {
anyhow::bail!("Cannot create GenomeRegion with start after end");
}
}
let len = end.map(|end| end.position_0() - start.position_0());
Ok(Self { start, len })
}
pub fn start(&self) -> &GenomePosition {
&self.start
}
pub fn len(&self) -> Option<usize> {
self.len
}
pub fn is_bounded(&self) -> bool {
self.len.is_some()
}
pub fn end_excl(&self) -> Option<GenomePosition> {
self.len.map(|l| self.start.clone() + l)
}
pub fn contains_pos(&self, pos: &GenomePosition) -> bool {
if self.start.contig != pos.contig {
return false;
}
if pos.position < self.start.position {
return false;
}
match self.end_excl() {
Some(end) => *pos < end,
None => true,
}
}
pub fn contains(&self, other: &GenomeRegion) -> bool {
if self.start.contig() != other.contig() {
return false;
}
self.start() <= other.start()
&& match (self.end_excl(), other.end_excl()) {
(None, None | Some(_)) => true,
(Some(_), None) => false,
(Some(this), Some(other)) => this >= other,
}
}
pub fn intersection(&self, other: &Self) -> Option<Self> {
if self.contig() != other.contig() {
return None;
}
let intersection_start = self.start.position_0().max(other.start.position_0());
let intersection_end = match (self.end_excl(), other.end_excl()) {
(None, None) => None,
(None, Some(end)) | (Some(end), None) => Some(end.position_0()),
(Some(self_end), Some(other_end)) => {
Some(self_end.position_0().min(other_end.position_0()))
}
};
let intersection_len = match intersection_end {
Some(end) if end > intersection_start => Some(end - intersection_start),
Some(_) => return None, None => None, };
Some(Self {
start: GenomePosition {
contig: self.start.contig.clone(),
position: intersection_start,
},
len: intersection_len,
})
}
pub fn merge(&self, other: &Self) -> Option<Self> {
if self.contig() != other.contig() {
return None;
}
let (mut this, mut other) = (self, other);
if self > other {
mem::swap(&mut this, &mut other);
}
if let Some(this_end) = this.end_excl() {
if &this_end < other.start() {
None
} else {
let end = other.end_excl().map(|other_end| {
if this_end >= other_end {
this_end
} else {
other_end
}
});
Some(GenomeRegion::from_incl_excl(this.start().clone(), end).ok()?)
}
} else {
Some(this.clone())
}
}
pub fn parse(s: &[u8]) -> anyhow::Result<Self> {
let Some(result) = parse_region_pos_fmt(s).or_else(|| parse_region_bed_fmt(s)) else {
anyhow::bail!("Cannot parse the region '{}'", s.as_bstr());
};
Ok(result)
}
}
static REGION_REGEX_POS_FORMAT: LazyLock<regex::bytes::Regex> =
LazyLock::new(|| regex::bytes::Regex::new(r"^(\w+)(?::(\d+)-(\d+)?)?$").unwrap());
fn parse_region_pos_fmt(s: &[u8]) -> Option<GenomeRegion> {
let captures = REGION_REGEX_POS_FORMAT.captures(s)?;
let chr = captures.get(1).unwrap();
let pos_1_incl = captures.get(2).and_then(extract_match).unwrap_or(1);
let end_1_incl = if let Some(end) = captures.get(3) {
extract_match(end)
} else {
None
};
let contig = ContigName::new(chr.as_bytes());
let start_incl = GenomePosition::new_1(contig.clone(), pos_1_incl);
let end_incl = end_1_incl.map(|end| GenomePosition::new_1(contig.clone(), end));
GenomeRegion::from_incl_incl(start_incl, end_incl).ok()
}
fn extract_match(m: regex::bytes::Match) -> Option<usize> {
let pos_1_incl: usize = m.as_bytes().to_str().ok()?.parse().ok()?;
Some(pos_1_incl)
}
static REGION_REGEX_BED_FORMAT: LazyLock<regex::bytes::Regex> =
LazyLock::new(|| regex::bytes::Regex::new(r"^(\w+)\t(\d+)\t(\d+)$").unwrap());
fn parse_region_bed_fmt(s: &[u8]) -> Option<GenomeRegion> {
let captures = REGION_REGEX_BED_FORMAT.captures(s)?;
let chr = captures.get(1).unwrap();
let pos_0_incl: usize = extract_match(captures.get(2)?)?;
let end_0_excl = extract_match(captures.get(3)?)?;
let contig = ContigName::new(chr.as_bytes());
let start_incl = GenomePosition::new_0(contig.clone(), pos_0_incl);
let end_excl = GenomePosition::new_0(contig.clone(), end_0_excl);
GenomeRegion::from_incl_excl(start_incl, Some(end_excl)).ok()
}
impl Display for GenomePosition {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}", self.contig.as_ref(), self.position_0() + 1)
}
}
impl Display for GenomeRegion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if let Some(len) = self.len() {
let end = self.start.position_0() + len - 1;
write!(f, "{}-{}", self.start(), end + 1)
} else {
write!(f, "{}-", self.start())
}
}
}
impl TryFrom<&rust_htslib::bcf::Record> for GenomeRegion {
type Error = TryFromIntError;
fn try_from(rec: &rust_htslib::bcf::Record) -> Result<Self, Self::Error> {
let chr = rec.header().rid2name(rec.rid().unwrap()).unwrap();
let pos = rec.pos();
let gpos = GenomePosition::new_0(chr.into(), usize::try_from(pos)?);
Ok(GenomeRegion::new_bounded(
gpos,
usize::try_from(rec.end() - pos)?,
))
}
}
impl GenomeRegion {
pub fn try_from_bam_record(
rec: &rust_htslib::bam::Record,
chr: ContigName,
) -> anyhow::Result<Self> {
let pos = rec.pos();
let gpos = GenomePosition::new_0(chr, pos.try_into()?);
let len = rec.reference_end() - pos;
Ok(GenomeRegion::new_bounded(gpos, usize::try_from(len)?))
}
}
impl<'a> TryFrom<&'a GenomeRegion> for bam::FetchDefinition<'a> {
type Error = anyhow::Error;
fn try_from(value: &'a GenomeRegion) -> Result<Self, Self::Error> {
Ok(match (value.start().position_0(), value.end_excl()) {
(0, None) => bam::FetchDefinition::String(value.contig().as_ref()),
(_, None) => anyhow::bail!(
"A region like {value} cannot be used for fetching from bam files. Use regions that either have an end coordinate, or include the entire sequence (e.g. chr1, chr1:100-200)"
),
(n, Some(end)) => bam::FetchDefinition::RegionString(
value.contig().as_ref(),
i64::try_from(n)?,
i64::try_from(end.position_0())?,
),
})
}
}
#[cfg(test)]
mod tests {
use super::*;
fn pos(contig: &str, p: usize) -> GenomePosition {
GenomePosition::new_0(ContigName::from(contig), p)
}
fn region(contig: &str, start: usize, len: usize) -> GenomeRegion {
GenomeRegion::new_bounded(pos(contig, start), len)
}
#[test]
fn parse_pos_format_range() {
let r = GenomeRegion::parse(b"chr1:100-200").unwrap();
assert_eq!(r.start().position_0(), 99);
assert_eq!(r.len(), Some(101));
}
#[test]
fn parse_pos_format_whole_contig() {
let r = GenomeRegion::parse(b"chr1").unwrap();
assert_eq!(r.start().position_0(), 0);
assert!(!r.is_bounded());
}
#[test]
fn parse_pos_format_open_ended() {
let r = GenomeRegion::parse(b"chr1:5-").unwrap();
assert_eq!(r.start().position_0(), 4);
assert!(!r.is_bounded());
}
#[test]
fn parse_bed_format_multi_char_contig() {
let r = GenomeRegion::parse(b"chr1\t99\t200").unwrap();
assert_eq!(r.start().position_0(), 99);
assert_eq!(r.len(), Some(101));
}
#[test]
fn parse_invalid_returns_error() {
assert!(GenomeRegion::parse(b"!!!invalid!!!").is_err());
}
#[test]
fn merge_overlapping_other_extends_further() {
let r1 = region("chr1", 10, 10);
let r2 = region("chr1", 15, 15);
let m = r1.merge(&r2).unwrap();
assert_eq!(m.start().position_0(), 10);
assert_eq!(m.len(), Some(20));
}
#[test]
fn merge_overlapping_this_extends_further() {
let r1 = region("chr1", 10, 25);
let r2 = region("chr1", 15, 10);
let m = r1.merge(&r2).unwrap();
assert_eq!(m.start().position_0(), 10);
assert_eq!(m.len(), Some(25));
}
#[test]
fn merge_adjacent_regions() {
let r1 = region("chr1", 10, 10);
let r2 = region("chr1", 20, 10);
let m = r1.merge(&r2).unwrap();
assert_eq!(m.start().position_0(), 10);
assert_eq!(m.len(), Some(20));
}
#[test]
fn merge_no_overlap_returns_none() {
let r1 = region("chr1", 10, 5);
let r2 = region("chr1", 20, 10);
assert!(r1.merge(&r2).is_none());
}
#[test]
fn merge_different_contigs_returns_none() {
let r1 = region("chr1", 10, 10);
let r2 = region("chr2", 10, 10);
assert!(r1.merge(&r2).is_none());
}
#[test]
fn merge_is_commutative() {
let r1 = region("chr1", 10, 10);
let r2 = region("chr1", 15, 20);
assert_eq!(r1.merge(&r2), r2.merge(&r1));
}
}