use crate::{OxbowError, Result};
/// Coordinate convention used to interpret the numbers in a region string.
///
/// The `Display` and `FromStr` implementations in this file represent the
/// variants with the two-character codes `"11"` and `"01"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CoordSystem {
    /// One-based, closed intervals (code `"11"`).
    OneClosed,
    /// Zero-based, half-open intervals (code `"01"`).
    ZeroHalfOpen,
}
impl CoordSystem {
pub fn start_offset_from(self, source_cs: CoordSystem) -> i32 {
match (source_cs, self) {
(CoordSystem::OneClosed, CoordSystem::ZeroHalfOpen) => -1,
(CoordSystem::ZeroHalfOpen, CoordSystem::OneClosed) => 1,
_ => 0,
}
}
}
impl std::fmt::Display for CoordSystem {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
CoordSystem::OneClosed => write!(f, "11"),
CoordSystem::ZeroHalfOpen => write!(f, "01"),
}
}
}
/// Parses the two-character codes `"11"` ([`CoordSystem::OneClosed`]) and
/// `"01"` ([`CoordSystem::ZeroHalfOpen`]); any other text is rejected with an
/// invalid-input error that quotes the offending string.
impl std::str::FromStr for CoordSystem {
    type Err = OxbowError;

    fn from_str(s: &str) -> Result<Self> {
        let system = match s {
            "01" => Self::ZeroHalfOpen,
            "11" => Self::OneClosed,
            other => {
                let message =
                    format!("invalid coordinate system '{other}'; expected \"01\" or \"11\"");
                return Err(OxbowError::invalid_input(message));
            }
        };
        Ok(system)
    }
}
/// A named reference sequence together with an optional interval on it.
///
/// The `Display` implementation in this file prints the bounds in half-open
/// bracket form, `name:[start,end)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Region {
    /// Name of the reference sequence.
    pub name: String,
    /// Lower bound of the interval; `0` when no start was supplied.
    pub start: u64,
    /// Upper bound of the interval, or `None` when no end was supplied.
    pub end: Option<u64>,
}
impl Region {
    /// Builds a region from a reference name and optional bounds.
    ///
    /// A missing `start` is stored as `0`; `end` is stored as given, with
    /// `None` meaning that no upper bound was supplied.
    pub fn new(name: impl Into<String>, start: Option<u64>, end: Option<u64>) -> Self {
        Self {
            name: name.into(),
            start: start.unwrap_or(0),
            end,
        }
    }

    /// Parses a region string.
    ///
    /// Bracket notation (`name:[start,end)`, `name:[start,end]`, or the
    /// open-ended `name:[start,)` that `Display` produces) is tried first.
    /// It carries its own convention, so `coord_system` is ignored for it.
    /// Any other input is parsed as `name`, `name:start` or `name:start-end`
    /// and interpreted according to `coord_system`.
    ///
    /// # Errors
    ///
    /// Returns an invalid-input error when the string is empty, the reference
    /// name is empty, a number cannot be parsed, or bracket notation is
    /// malformed.
    pub fn parse(s: &str, coord_system: CoordSystem) -> Result<Self> {
        match Self::try_parse_bracket(s) {
            Some(result) => result,
            None => Self::parse_ucsc(s, coord_system),
        }
    }

    /// Parses `name`, `name:`, `name:start` or `name:start-end`.
    ///
    /// The text after the last `:` is the interval. Numbers may contain `,`
    /// or `_` separators. With [`CoordSystem::OneClosed`] the start is
    /// decremented by one (saturating at zero) and the end is kept; with
    /// [`CoordSystem::ZeroHalfOpen`] both numbers are kept as written.
    fn parse_ucsc(s: &str, coord_system: CoordSystem) -> Result<Self> {
        if s.is_empty() {
            return Err(OxbowError::invalid_input("empty region string"));
        }
        // Split on the last ':' so that everything before it is the name.
        let (name, interval) = match s.rsplit_once(':') {
            Some((name, "")) => (name, None),
            Some((name, suffix)) => (name, Some(suffix)),
            None => (s, None),
        };
        if name.is_empty() {
            return Err(OxbowError::invalid_input("empty reference name"));
        }
        let (start, end) = match interval {
            None => (None, None),
            // The first '-' separates start from end; without one the
            // interval is a lone start position.
            Some(iv) => match iv.split_once('-') {
                Some((lo, hi)) => {
                    let start = parse_number(lo)?;
                    let end = parse_number(hi)?;
                    (Some(start), Some(end))
                }
                None => (Some(parse_number(iv)?), None),
            },
        };
        let start = match coord_system {
            CoordSystem::OneClosed => start.map(|one_based| one_based.saturating_sub(1)),
            CoordSystem::ZeroHalfOpen => start,
        };
        Ok(Self::new(name, start, end))
    }

    /// Attempts to parse bracket notation.
    ///
    /// Returns `None` when the text after the last `:` does not begin with
    /// `[`, so the caller can fall back to another syntax. Otherwise returns
    /// `Some` with the outcome of parsing the bracketed interval.
    fn try_parse_bracket(s: &str) -> Option<Result<Self>> {
        let (name, rest) = s.rsplit_once(':')?;
        let rest = rest.strip_prefix('[')?;
        Some(Self::parse_bracket_body(s, name, rest))
    }

    /// Parses the part of a bracket-notation string that follows `[`.
    ///
    /// `s` is the complete input and is only used in error messages. A
    /// trailing `)` leaves the start as written; a trailing `]` decrements
    /// the start by one (saturating at zero). `_` separators are ignored.
    /// An empty end is accepted only with `)` and yields a region without an
    /// upper bound.
    fn parse_bracket_body(s: &str, name: &str, rest: &str) -> Result<Self> {
        // Same rule as the non-bracket syntax: a region needs a name.
        if name.is_empty() {
            return Err(OxbowError::invalid_input("empty reference name"));
        }
        let (half_open, body) = if let Some(body) = rest.strip_suffix(')') {
            (true, body)
        } else if let Some(body) = rest.strip_suffix(']') {
            (false, body)
        } else {
            return Err(OxbowError::invalid_input(format!(
                "bracket notation must end with ')' or ']': '{s}'"
            )));
        };
        let body: String = body.chars().filter(|c| *c != '_').collect();
        let (start_str, end_str) = body.split_once(',').ok_or_else(|| {
            OxbowError::invalid_input(format!("bracket notation requires 'start,end': '{s}'"))
        })?;
        let start = start_str.parse::<u64>().map_err(|_| {
            OxbowError::invalid_input(format!("invalid start in bracket notation: '{s}'"))
        })?;
        // `Display` writes regions without an end as `name:[start,)`, so
        // that form has to parse back to `end == None`.
        let end = if half_open && end_str.is_empty() {
            None
        } else {
            let end = end_str.parse::<u64>().map_err(|_| {
                OxbowError::invalid_input(format!("invalid end in bracket notation: '{s}'"))
            })?;
            Some(end)
        };
        let start = if half_open {
            start
        } else {
            start.saturating_sub(1)
        };
        Ok(Self::new(name, Some(start), end))
    }

    /// Converts this region into a `noodles::core::Region`.
    ///
    /// A start of `0` with no end becomes an unbounded region. Otherwise the
    /// start is shifted up by one and the end is passed through unchanged
    /// before being turned into `Position` values.
    ///
    /// # Errors
    ///
    /// Returns an invalid-input error when a bound does not fit in `usize`,
    /// when shifting the start overflows, or when `Position` rejects the
    /// value.
    pub fn to_noodles(&self) -> std::result::Result<noodles::core::Region, OxbowError> {
        use noodles::core::Position;

        // Checked arithmetic instead of `as usize + 1`, so that oversized
        // values become errors rather than overflowing or truncating.
        let start_position = |start: u64| -> Result<Position> {
            usize::try_from(start)
                .ok()
                .and_then(|zero_based| zero_based.checked_add(1))
                .and_then(|one_based| Position::try_from(one_based).ok())
                .ok_or_else(|| OxbowError::invalid_input("start position out of range"))
        };
        let end_position = |end: u64| -> Result<Position> {
            usize::try_from(end)
                .ok()
                .and_then(|value| Position::try_from(value).ok())
                .ok_or_else(|| OxbowError::invalid_input("end position out of range"))
        };

        let name = self.name.as_str();
        match (self.start, self.end) {
            (0, None) => Ok(noodles::core::Region::new(name, ..)),
            (s, None) => {
                let start = start_position(s)?;
                Ok(noodles::core::Region::new(name, start..))
            }
            (s, Some(e)) => {
                let start = start_position(s)?;
                let end = end_position(e)?;
                Ok(noodles::core::Region::new(name, start..=end))
            }
        }
    }
}
impl std::str::FromStr for Region {
type Err = OxbowError;
fn from_str(s: &str) -> Result<Self> {
Self::parse(s, CoordSystem::OneClosed)
}
}
/// Formats the region as its name followed by an optional half-open bracket
/// interval.
///
/// A region with start `0` and no end prints only the name. A start without
/// an end prints `name:[start,)`; otherwise `name:[start,end)` is printed.
impl std::fmt::Display for Region {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.name)?;
        match (self.start, self.end) {
            // Nothing to add for a whole-reference region.
            (0, None) => Ok(()),
            (s, Some(e)) => write!(f, ":[{s},{e})"),
            (s, None) => write!(f, ":[{s},)"),
        }
    }
}
/// Parses an unsigned integer, ignoring `,` and `_` separator characters.
///
/// # Errors
///
/// Returns an invalid-input error quoting the original text `s` when the
/// remaining characters do not parse as a `u64`.
fn parse_number(s: &str) -> Result<u64> {
    let mut digits = String::with_capacity(s.len());
    digits.extend(s.chars().filter(|&ch| !matches!(ch, ',' | '_')));
    match digits.parse::<u64>() {
        Ok(value) => Ok(value),
        Err(_) => Err(OxbowError::invalid_input(format!("invalid number: '{s}'"))),
    }
}
/// Unit tests for coordinate-system codes and region parsing, formatting and
/// conversion.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_coord_system_codes() {
        assert_eq!(CoordSystem::OneClosed.to_string(), "11");
        assert_eq!(CoordSystem::ZeroHalfOpen.to_string(), "01");
        for cs in [CoordSystem::OneClosed, CoordSystem::ZeroHalfOpen] {
            let parsed: CoordSystem = cs.to_string().parse().unwrap();
            assert_eq!(parsed, cs);
        }
        assert!("10".parse::<CoordSystem>().is_err());
        assert!("".parse::<CoordSystem>().is_err());
    }

    #[test]
    fn test_start_offset_from() {
        let one = CoordSystem::OneClosed;
        let zero = CoordSystem::ZeroHalfOpen;
        assert_eq!(zero.start_offset_from(one), -1);
        assert_eq!(one.start_offset_from(zero), 1);
        assert_eq!(one.start_offset_from(one), 0);
        assert_eq!(zero.start_offset_from(zero), 0);
    }

    #[test]
    fn test_new() {
        let r = Region::new("chr1", Some(100), Some(200));
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 100);
        assert_eq!(r.end, Some(200));
    }

    #[test]
    fn test_new_defaults() {
        let r = Region::new("chr1", None, None);
        assert_eq!(r.start, 0);
        assert_eq!(r.end, None);
    }

    #[test]
    fn test_ucsc_one_closed() {
        let r = Region::parse("chr1:10001-20000", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_ucsc_zero_half_open() {
        let r = Region::parse("chr1:10000-20000", CoordSystem::ZeroHalfOpen).unwrap();
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_ucsc_whole_chrom() {
        let r = Region::parse("chr1", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 0);
        assert_eq!(r.end, None);
    }

    #[test]
    fn test_ucsc_trailing_colon() {
        // A trailing ':' with nothing after it is treated like a bare name.
        let r = Region::parse("chr1:", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 0);
        assert_eq!(r.end, None);
    }

    #[test]
    fn test_ucsc_start_only() {
        let r = Region::parse("chr1:5000", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.start, 4999);
        assert_eq!(r.end, None);
    }

    #[test]
    fn test_ucsc_thousands_separators() {
        let r = Region::parse("chr1:10,001-20,000", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
        let r = Region::parse("chr1:10_001-20_000", CoordSystem::OneClosed).unwrap();
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_ucsc_empty_name_errors() {
        assert!(Region::parse(":100-200", CoordSystem::OneClosed).is_err());
    }

    #[test]
    fn test_ucsc_invalid_number_errors() {
        assert!(Region::parse("chr1:abc-200", CoordSystem::OneClosed).is_err());
        assert!(Region::parse("chr1:100-xyz", CoordSystem::OneClosed).is_err());
    }

    #[test]
    fn test_bracket_half_open() {
        let r: Region = "chr1:[10000,20000)".parse().unwrap();
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_bracket_closed() {
        let r: Region = "chr1:[10001,20000]".parse().unwrap();
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_bracket_overrides_coord_system() {
        let r = Region::parse("chr1:[10001,20000]", CoordSystem::ZeroHalfOpen).unwrap();
        assert_eq!(r.name, "chr1");
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_bracket_with_separators() {
        let r: Region = "chr1:[10_000,20_000)".parse().unwrap();
        assert_eq!(r.start, 10000);
        assert_eq!(r.end, Some(20000));
    }

    #[test]
    fn test_display_roundtrip() {
        let r = Region::new("chr1", Some(10000), Some(20000));
        assert_eq!(r.to_string(), "chr1:[10000,20000)");
        let parsed: Region = r.to_string().parse().unwrap();
        assert_eq!(r, parsed);
    }

    #[test]
    fn test_display_whole_chrom() {
        let r = Region::new("chr1", None, None);
        assert_eq!(r.to_string(), "chr1");
    }

    #[test]
    fn test_display_start_only() {
        let r = Region::new("chr1", Some(100), None);
        assert_eq!(r.to_string(), "chr1:[100,)");
    }

    #[test]
    fn test_to_noodles_full_range() {
        let r = Region::new("chr1", Some(10000), Some(20000));
        let nr = r.to_noodles().unwrap();
        assert_eq!(nr.name(), &b"chr1"[..]);
        let start = noodles::core::Position::try_from(10001).unwrap();
        let end = noodles::core::Position::try_from(20000).unwrap();
        assert_eq!(nr.start(), std::ops::Bound::Included(start));
        assert_eq!(nr.end(), std::ops::Bound::Included(end));
    }

    #[test]
    fn test_to_noodles_start_only() {
        let r = Region::new("chr1", Some(4999), None);
        let nr = r.to_noodles().unwrap();
        let start = noodles::core::Position::try_from(5000).unwrap();
        assert_eq!(nr.start(), std::ops::Bound::Included(start));
        assert_eq!(nr.end(), std::ops::Bound::Unbounded);
    }

    #[test]
    fn test_to_noodles_whole_chrom() {
        let r = Region::new("chr1", None, None);
        let nr = r.to_noodles().unwrap();
        assert_eq!(nr.start(), std::ops::Bound::Unbounded);
        assert_eq!(nr.end(), std::ops::Bound::Unbounded);
    }

    #[test]
    fn test_empty_string_errors() {
        assert!(Region::parse("", CoordSystem::OneClosed).is_err());
    }

    #[test]
    fn test_invalid_bracket_notation() {
        assert!("chr1:[10000,20000".parse::<Region>().is_err());
        assert!("chr1:[10000)".parse::<Region>().is_err());
    }
}