use anyhow::{Context, Result};
use rustc_hash::FxHashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
/// A single interval with an optional strand marker.
///
/// Values of this type are what `RegionList::iter` hands out; there the strand
/// is `Some(b'+')`, `Some(b'-')`, or `None` when no strand is recorded.
#[derive(Clone, Copy, Debug)]
pub struct Region {
    /// First coordinate of the interval.
    pub start: usize,
    /// Second coordinate of the interval.
    pub end: usize,
    /// Strand byte, or `None` when the region carries no strand.
    pub strand: Option<u8>,
}

impl Region {
    /// Bundles the three components into a `Region` without any validation.
    pub fn new(start: usize, end: usize, strand: Option<u8>) -> Self {
        Region {
            start,
            end,
            strand,
        }
    }
}
/// The regions recorded for one sequence, kept as two parallel vectors.
///
/// `regions[i]` holds the `(start, end)` pair and `strands[i]` the strand code
/// of the same region: `1` for `+`, `-1` for `-`, any other value for
/// "no strand". Both fields are public; `iter` pairs them positionally, so if
/// they are edited directly and end up with different lengths the surplus
/// entries of the longer vector are ignored.
#[derive(Debug, Clone, Default)]
pub struct RegionList {
    pub regions: Vec<(u64, u64)>,
    pub strands: Vec<i8>,
}

impl RegionList {
    /// Creates an empty list.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends one region together with its strand code.
    pub fn add_region(&mut self, start: u64, end: u64, strand: i8) {
        self.regions.push((start, end));
        self.strands.push(strand);
    }

    /// Iterates over the stored regions as [`Region`] values.
    ///
    /// Coordinates are converted from `u64` to `usize` with saturation, so a
    /// value that does not fit (possible on targets where `usize` is narrower
    /// than 64 bits) becomes `usize::MAX` instead of silently wrapping.
    pub fn iter(&self) -> impl Iterator<Item = Region> + '_ {
        self.regions
            .iter()
            .zip(&self.strands)
            .map(|(&(start, end), &strand)| Region {
                start: Self::clamp_to_usize(start),
                end: Self::clamp_to_usize(end),
                strand: match strand {
                    1 => Some(b'+'),
                    -1 => Some(b'-'),
                    _ => None,
                },
            })
    }

    /// Returns `true` when `pos` lies in at least one region.
    ///
    /// Regions are treated as half-open: `start` is included, `end` is not.
    pub fn contains(&self, pos: u64) -> bool {
        self.regions
            .iter()
            .any(|&(start, end)| (start..end).contains(&pos))
    }

    /// Number of stored regions.
    pub fn len(&self) -> usize {
        self.regions.len()
    }

    /// Returns `true` when no region has been added.
    pub fn is_empty(&self) -> bool {
        self.regions.is_empty()
    }

    /// Converts a coordinate to `usize`, saturating at `usize::MAX`.
    fn clamp_to_usize(value: u64) -> usize {
        // Path-qualified so this compiles whether or not `TryFrom` is in the prelude.
        <usize as std::convert::TryFrom<u64>>::try_from(value).unwrap_or(usize::MAX)
    }
}
/// Regions grouped by sequence name; the key is the name's raw bytes.
pub type RegionHash = FxHashMap<Vec<u8>, RegionList>;

/// Reads a tab-separated region file and groups its regions by sequence name.
///
/// Empty lines and lines starting with `#` are skipped. Every other line is
/// split on tabs and interpreted by field position (BED-like layout):
///
/// * field 0 — sequence name (always used as the map key);
/// * field 1 — start; falls back to `0` when it does not parse as `u64`;
/// * field 2 — end; falls back to `u64::MAX` when missing or unparsable;
/// * field 5 — strand: `"+"` is stored as `1`, `"-"` as `-1`, anything else
///   (or a missing field) as `0`.
///
/// A line consisting of the name alone selects the whole sequence and is
/// stored as `(0, u64::MAX)` with strand `0`.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or a line cannot be read.
/// Malformed coordinates are not errors; they take the fallbacks listed above.
pub fn parse_regions(path: &str) -> Result<RegionHash> {
    let file = File::open(path).with_context(|| format!("无法打开区域文件: {}", path))?;
    let reader = BufReader::new(file);
    let mut hash: RegionHash = FxHashMap::default();

    for (line_num, line) in reader.lines().enumerate() {
        let line = line.with_context(|| format!("读取文件{}第{}行失败", path, line_num + 1))?;
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // `str::split` always yields at least one item, so field 0 is always
        // present; the fallback below is never taken and only avoids a panic path.
        let mut fields = line.split('\t');
        let name = fields.next().unwrap_or("").as_bytes().to_vec();
        let entry = hash.entry(name).or_default();

        let (start, end, strand): (u64, u64, i8) = match fields.next() {
            // Name-only line: select the whole sequence.
            None => (0, u64::MAX, 0),
            Some(start_field) => {
                let start = start_field.parse::<u64>().unwrap_or(0);
                let end = fields
                    .next()
                    .and_then(|s| s.parse::<u64>().ok())
                    .unwrap_or(u64::MAX);
                // Fields 0..=2 are consumed; `nth(2)` skips fields 3 and 4 and
                // yields field 5, the strand column.
                let strand = match fields.nth(2) {
                    Some("+") => 1,
                    Some("-") => -1,
                    _ => 0,
                };
                (start, end, strand)
            }
        };
        entry.add_region(start, end, strand);
    }

    Ok(hash)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// An empty list reports empty; membership is half-open on `[start, end)`.
    #[test]
    fn test_region_list_basic() {
        let mut list = RegionList::new();
        assert!(list.is_empty());
        assert_eq!(list.len(), 0);

        list.add_region(100, 200, 1);
        assert!(!list.is_empty());
        assert_eq!(list.len(), 1);
        assert!(list.contains(100));
        assert!(list.contains(150));
        assert!(list.contains(199));
        assert!(!list.contains(200));
        assert!(!list.contains(99));
    }

    /// Name-only lines select whole sequences; comment and blank lines are skipped.
    #[test]
    fn test_parse_name_list() -> Result<()> {
        let mut temp_file = NamedTempFile::new()?;
        writeln!(temp_file, "seq1")?;
        writeln!(temp_file, "seq2")?;
        writeln!(temp_file, "# 注释行")?;
        writeln!(temp_file)?;
        writeln!(temp_file, "seq3")?;

        let regions = parse_regions(temp_file.path().to_str().unwrap())?;
        assert_eq!(regions.len(), 3);
        assert!(regions.contains_key(b"seq1".as_slice()));
        assert!(regions.contains_key(b"seq2".as_slice()));
        assert!(regions.contains_key(b"seq3".as_slice()));

        let seq1_regions = &regions[b"seq1".as_slice()];
        assert_eq!(seq1_regions.len(), 1);
        assert_eq!(seq1_regions.regions[0], (0, u64::MAX));
        Ok(())
    }

    /// Six-column lines carry a strand; three-column lines default to strand 0.
    #[test]
    fn test_parse_bed_format() -> Result<()> {
        let mut temp_file = NamedTempFile::new()?;
        writeln!(temp_file, "chr1\t100\t200\tregion1\t0\t+")?;
        writeln!(temp_file, "chr1\t300\t400\tregion2\t0\t-")?;
        writeln!(temp_file, "chr2\t500\t600")?;

        let regions = parse_regions(temp_file.path().to_str().unwrap())?;
        assert_eq!(regions.len(), 2);

        let chr1_regions = &regions[b"chr1".as_slice()];
        assert_eq!(chr1_regions.len(), 2);
        assert_eq!(chr1_regions.regions[0], (100, 200));
        assert_eq!(chr1_regions.strands[0], 1);
        assert_eq!(chr1_regions.regions[1], (300, 400));
        assert_eq!(chr1_regions.strands[1], -1);

        let chr2_regions = &regions[b"chr2".as_slice()];
        assert_eq!(chr2_regions.len(), 1);
        assert_eq!(chr2_regions.regions[0], (500, 600));
        assert_eq!(chr2_regions.strands[0], 0);
        Ok(())
    }

    /// Several lines for the same name accumulate into one list.
    #[test]
    fn test_multiple_regions_same_chr() -> Result<()> {
        let mut temp_file = NamedTempFile::new()?;
        writeln!(temp_file, "chr1\t100\t200")?;
        writeln!(temp_file, "chr1\t300\t400")?;
        writeln!(temp_file, "chr1\t500\t600")?;

        let regions = parse_regions(temp_file.path().to_str().unwrap())?;
        let chr1_regions = &regions[b"chr1".as_slice()];
        assert_eq!(chr1_regions.len(), 3);
        assert!(chr1_regions.contains(150));
        assert!(chr1_regions.contains(350));
        assert!(chr1_regions.contains(550));
        assert!(!chr1_regions.contains(250));
        Ok(())
    }
}