use fnv::FnvBuildHasher;
use std::{cmp::Ordering, collections::HashMap};
use thiserror::Error;
#[derive(Clone, Debug)]
pub struct GenomeMap<T> {
lookup: HashMap<String, usize, FnvBuildHasher>,
reverse_lookup: HashMap<usize, String, FnvBuildHasher>,
values: Vec<T>,
pub sorted_keys: Vec<String>,
}
impl<T> Default for GenomeMap<T> {
fn default() -> Self {
Self {
lookup: HashMap::default(),
reverse_lookup: HashMap::default(),
values: Vec::default(),
sorted_keys: Vec::with_capacity(50),
}
}
}
#[derive(Debug, Error)]
pub enum GenomeMapError {
#[error("GenomeMap.insert() error: already contains the sequence '{0}'")]
SequenceMapContainsSeqname(String),
}
impl<W, T> PartialEq<GenomeMap<W>> for GenomeMap<T>
where
T: PartialEq<W>,
W: IntoIterator,
T: IntoIterator,
{
fn eq(&self, other: &GenomeMap<W>) -> bool {
self.iter()
.zip(other.iter())
.all(|((k1, v1), (k2, v2))| k1 == k2 && v1 == v2)
}
}
impl<T> GenomeMap<T> {
pub fn new() -> Self {
GenomeMap::default()
}
pub fn get(&self, name: &str) -> Option<&T> {
self.lookup.get(name).map(|idx| &self.values[*idx])
}
pub fn get_mut(&mut self, name: &str) -> Option<&mut T> {
self.lookup.get_mut(name).map(|idx| &mut self.values[*idx])
}
pub fn get_by_index(&self, index: usize) -> Option<&T> {
let name = self.get_name_by_index(index);
name.and_then(|key| self.get(key))
}
pub fn get_name_by_index(&self, index: usize) -> Option<&str> {
self.sorted_keys.get(index).as_ref().map(|x| x.as_str())
}
pub fn get_index_by_name(&self, name: &str) -> Option<usize> {
match self
.sorted_keys
.binary_search_by(|probe| chromosome_probe(probe, name))
{
Ok(index) => Some(index),
Err(_) => None,
}
}
pub fn names(&self) -> Vec<String> {
self.sorted_keys.clone()
}
pub fn len(&self) -> usize {
let len = self.lookup.len();
assert_eq!(len, self.values.len());
len
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn indices(&self) -> std::ops::Range<usize> {
std::ops::Range {
start: 0,
end: self.len(),
}
}
pub fn contains(&self, name: &str) -> bool {
self.lookup.contains_key(name)
}
pub fn insert(&mut self, name: &str, value: T) -> Result<usize, GenomeMapError> {
if self.contains(name) {
return Err(GenomeMapError::SequenceMapContainsSeqname(name.to_string()));
}
self.values.push(value);
let insertion_index = self.values.len() - 1;
self.lookup.insert(name.to_string(), insertion_index);
self.reverse_lookup
.insert(insertion_index, name.to_string());
let index = match self
.sorted_keys
.binary_search_by(|probe| chromosome_probe(probe, name))
{
Ok(_) => {
panic!("An internal error was encountered. Please report an issue: https://github.com/vsbuffalo/genomemap/issues");
}
Err(pos) => {
self.sorted_keys.insert(pos, name.to_string());
pos
}
};
Ok(index)
}
pub fn entry_or_default(&mut self, name: &str) -> &mut T
where
T: Default,
{
let insertion_index = match self.lookup.get(name) {
Some(&index) => index,
None => {
let new_entry = T::default();
let _ = self.insert(name, new_entry).unwrap();
*self.lookup.get(name).unwrap()
}
};
&mut self.values[insertion_index]
}
pub fn iter(&self) -> impl Iterator<Item = (&String, &T)> {
self.sorted_keys
.iter()
.map(|name| (name, self.get(name).unwrap()))
}
pub fn values_mut(&mut self) -> std::slice::IterMut<'_, T> {
self.values.iter_mut()
}
pub fn values(&self) -> impl Iterator<Item = &T> {
self.sorted_keys.iter().map(|name| self.get(name).unwrap())
}
pub fn shrink_to_fit(&mut self) {
self.lookup.shrink_to_fit();
self.reverse_lookup.shrink_to_fit();
self.sorted_keys.shrink_to_fit();
}
}
impl<T> IntoIterator for GenomeMap<T> {
type Item = (String, T);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
let pairs = self
.sorted_keys
.into_iter()
.zip(self.values)
.collect::<Vec<_>>();
pairs.into_iter()
}
}
impl<'a, T> FromIterator<(String, T)> for GenomeMap<T>
where
T: 'a,
{
fn from_iter<I: IntoIterator<Item = (String, T)>>(iter: I) -> Self {
let mut genomap = GenomeMap::new();
for (name, item) in iter {
let msg = format!(
"Invalid iterator: contains multiple entries for '{}', which would be clobbered.",
name
);
genomap.insert(&name, item).expect(&msg);
}
genomap
}
}
pub fn chromosome_probe(name: &str, target: &str) -> Ordering {
let extract_parts = |chr: &str| -> (u8, Option<u32>, String, u8) {
let chr_trimmed = chr.trim_start_matches("chr");
match chr_trimmed {
"X" => (2, None, chr_trimmed.to_string(), 0),
"Y" => (3, None, chr_trimmed.to_string(), 0),
"M" | "MT" | "Mt" => (4, None, chr_trimmed.to_string(), 0),
"Z" => (5, None, chr_trimmed.to_string(), 0),
"W" => (6, None, chr_trimmed.to_string(), 0),
"O" => (7, None, chr_trimmed.to_string(), 0),
_ if chr_trimmed.ends_with('L') || chr_trimmed.ends_with('R') => {
let (num, arm) = chr_trimmed.split_at(chr_trimmed.len() - 1);
if let Ok(num) = num.parse::<u32>() {
let arm_priority = if arm == "L" { 1 } else { 2 }; (1, Some(num), String::new(), arm_priority)
} else {
(8, None, chr_trimmed.to_string(), 0)
}
}
_ if chr_trimmed.parse::<u32>().is_ok() => {
(1, Some(chr_trimmed.parse().unwrap()), String::new(), 0)
}
_ => (8, None, chr_trimmed.to_string(), 0),
}
};
let name_parts = extract_parts(name);
let target_parts = extract_parts(target);
name_parts.cmp(&target_parts)
}
#[cfg(test)]
mod test {
use super::GenomeMap;
#[test]
fn test_genomemap_new() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
assert!(sm.is_empty());
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
assert_eq!(sm.indices(), 0..2);
assert_eq!(sm.len(), 2);
assert!(!sm.is_empty());
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr2", 1).unwrap();
sm.insert("chr1", 2).unwrap();
assert_eq!(sm.indices(), 0..2);
}
#[test]
fn test_genomemap_contains() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
assert!(sm.is_empty());
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
assert!(sm.contains("chr1"));
assert!(sm.contains("chr2"));
assert!(!sm.contains("chrX"));
}
#[test]
fn test_genomemap_shrink() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
sm.shrink_to_fit();
}
#[test]
#[should_panic]
fn test_genomemap_clobber() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr1", 2).unwrap();
}
#[test]
fn test_entry_or_default() {
let mut sm: GenomeMap<Vec<i32>> = GenomeMap::new();
let vec = sm.entry_or_default("chr2");
assert!(vec.is_empty());
vec.push(1);
vec.push(2);
let vec = sm.entry_or_default("chr1");
vec.push(1);
let vec_chr1 = sm.get("chr2");
assert_eq!(*vec_chr1.unwrap(), vec![1, 2]);
assert_eq!(sm.len(), 2);
}
#[test]
fn test_genomemap_get_mut() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
assert_eq!(*sm.get("chr1").unwrap(), 1);
let val = sm.get_mut("chr1").unwrap();
*val = 10;
assert_eq!(*sm.get("chr1").unwrap(), 10);
}
#[test]
fn test_genomemap_get() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
assert_eq!(*sm.get("chr1").unwrap(), 1);
assert_eq!(*sm.get("chr2").unwrap(), 2);
assert_eq!(sm.get("chr4"), None);
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr2", 2).unwrap();
sm.insert("chr1", 1).unwrap();
assert_eq!(*sm.get("chr1").unwrap(), 1);
assert_eq!(*sm.get("chr2").unwrap(), 2);
assert_eq!(sm.get("chr4"), None);
}
#[test]
fn test_genomemap_get_by_index() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
let chr1_idx = sm.get_index_by_name("chr1");
assert_eq!(chr1_idx.unwrap(), 0);
let chr2_idx = sm.get_index_by_name("chr2");
assert_eq!(chr2_idx.unwrap(), 1);
let chr3_idx = sm.get_index_by_name("chr3");
assert_eq!(chr3_idx, None);
assert_eq!(*sm.get("chr1").unwrap(), 1);
assert_eq!(*sm.get("chr2").unwrap(), 2);
assert_eq!(sm.get("chr3"), None);
assert_eq!(*sm.get_by_index(chr1_idx.unwrap()).unwrap(), 1);
assert_eq!(*sm.get_by_index(chr2_idx.unwrap()).unwrap(), 2);
}
#[test]
fn test_map_get_name_by_index_and_name() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
let index = sm.get_index_by_name("chr1").unwrap();
assert_eq!(*sm.get_by_index(index).unwrap(), 1);
assert_eq!(sm.get_name_by_index(index).unwrap(), "chr1");
}
#[test]
fn test_map_get_name_by_index() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
assert_eq!(sm.get_name_by_index(0).unwrap(), "chr1");
assert_eq!(sm.get_name_by_index(1).unwrap(), "chr2");
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr2", 2).unwrap();
sm.insert("chr1", 1).unwrap();
assert_eq!(sm.get_name_by_index(0).unwrap(), "chr1");
assert_eq!(sm.get_name_by_index(1).unwrap(), "chr2");
}
#[test]
fn test_sorting_human() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr2", 2).unwrap();
sm.insert("chr1", 1).unwrap();
sm.insert("chr22", 3).unwrap();
sm.insert("chrX", 4).unwrap();
sm.insert("chrY", 5).unwrap();
sm.insert("chrMt", 6).unwrap();
let expected_order = vec!["chr1", "chr2", "chr22", "chrX", "chrY", "chrMt"];
assert_eq!(sm.names(), expected_order);
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("2", 2).unwrap();
sm.insert("1", 1).unwrap();
sm.insert("22", 3).unwrap();
sm.insert("X", 4).unwrap();
sm.insert("Y", 5).unwrap();
sm.insert("Mt", 6).unwrap();
let expected_order = vec!["1", "2", "22", "X", "Y", "Mt"];
assert_eq!(sm.names(), expected_order);
}
#[test]
fn test_sorting_drosophila() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr2L", 2).unwrap();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2R", 3).unwrap();
sm.insert("chrX", 5).unwrap();
sm.insert("chrY", 6).unwrap();
sm.insert("chr4", 4).unwrap();
let expected_order = vec!["chr1", "chr2L", "chr2R", "chr4", "chrX", "chrY"];
assert_eq!(sm.names(), expected_order);
}
#[test]
fn test_into_iter() {
let mut sm: GenomeMap<i32> = GenomeMap::new();
sm.insert("chr1", 1).unwrap();
sm.insert("chr2", 2).unwrap();
let mut iter = sm.into_iter();
assert_eq!(iter.next(), Some(("chr1".to_string(), 1)));
assert_eq!(iter.next(), Some(("chr2".to_string(), 2)));
assert!(iter.next().is_none());
}
}