use crate::Library;
use anyhow::{anyhow, Result};
use bstr::{io::BufReadExt, ByteSlice};
use hashbrown::HashMap;
use std::{fs::File, io::BufReader, path::Path};
/// Lookup table between sgRNA names and gene names, both held as raw bytes.
///
/// When built from a buffer or file, each sgRNA name is the key and the gene
/// name on the same line is the value; `get` is queried by sgRNA name.
#[derive(Debug)]
pub struct GeneMap {
// Keys are sgRNA names and values are gene names when populated by `build`.
map: HashMap<Vec<u8>, Vec<u8>>,
}
impl GeneMap {
    /// Wraps an already-built table (sgRNA name -> gene name) without any
    /// validation of its contents.
    pub fn from_hashmap(map: HashMap<Vec<u8>, Vec<u8>>) -> Self {
        Self { map }
    }

    /// Loads a gene map from the tab-separated file at `path`.
    ///
    /// Each line is expected to hold `gene<TAB>sgrna`.
    ///
    /// # Errors
    ///
    /// Returns an error when `path` does not exist, when the file cannot be
    /// opened or read, when a line has no tab delimiter, or when the same
    /// sgRNA name appears on more than one line.
    pub fn new(path: &str) -> Result<Self> {
        Self::validate_path(path)?;
        let map = Self::build_from_file(path)?;
        Ok(Self { map })
    }

    /// Loads a gene map from any buffered reader yielding `gene<TAB>sgrna`
    /// lines.
    ///
    /// # Errors
    ///
    /// Returns an error when reading fails, when a line has no tab delimiter,
    /// or when the same sgRNA name appears on more than one line.
    pub fn new_from_buffer<R: BufReadExt>(buffer: R) -> Result<Self> {
        let map = Self::build(buffer)?;
        Ok(Self { map })
    }

    /// Checks that `path` exists so the caller gets a descriptive message
    /// rather than a bare I/O error from `File::open`.
    fn validate_path(path: &str) -> Result<()> {
        if Path::new(path).exists() {
            Ok(())
        } else {
            Err(anyhow!(
                "Provided gene mapping path doesn't exist: {}",
                path
            ))
        }
    }

    /// Opens `path` and parses it through a `BufReader`.
    fn build_from_file(path: &str) -> Result<HashMap<Vec<u8>, Vec<u8>>> {
        let file = File::open(path)?;
        let buffer = BufReader::new(file);
        Self::build(buffer)
    }

    /// Parses `gene<TAB>sgrna` lines into a table keyed by sgRNA name.
    ///
    /// Only the first tab on a line is treated as the delimiter; everything
    /// after it becomes the sgRNA key.
    ///
    /// # Errors
    ///
    /// Malformed input is reported as an error instead of a panic, because
    /// the content comes from a user-supplied file: a line without a tab and
    /// a repeated sgRNA key both yield `Err`. Read failures from the
    /// underlying reader are propagated as well.
    fn build<R: BufReadExt>(mut buffer: R) -> Result<HashMap<Vec<u8>, Vec<u8>>> {
        let mut map = HashMap::new();
        // 1-based counter, used only to make error messages actionable.
        let mut line_number: usize = 0;
        buffer.for_byte_line(|line| {
            line_number += 1;
            let pos = match line.find_byte(b'\t') {
                Some(pos) => pos,
                None => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        format!(
                            "Missing tab delimiter on line {} of gene map",
                            line_number
                        ),
                    ));
                }
            };
            let (gene, rest) = line.split_at(pos);
            // `rest` begins with the tab found above, so skipping one byte is
            // always in bounds.
            let sgrna = &rest[1..];
            if map.insert(sgrna.to_vec(), gene.to_vec()).is_some() {
                // Lossy rendering keeps a non-UTF-8 key from causing a
                // second failure while reporting the first one.
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    format!(
                        "Duplicate sgRNA key found in gene map: {}",
                        String::from_utf8_lossy(sgrna)
                    ),
                ));
            }
            Ok(true)
        })?;
        Ok(map)
    }

    /// Returns the gene name stored for `sgrna`, or `None` when the sgRNA is
    /// not present in the map.
    #[must_use]
    pub fn get(&self, sgrna: &[u8]) -> Option<&Vec<u8>> {
        self.map.get(sgrna)
    }

    /// Returns a copy of the first value yielded by `library.values()` that
    /// has no entry in this map, or `None` when every value is covered.
    #[must_use]
    pub fn missing_aliases(&self, library: &Library) -> Option<Vec<u8>> {
        library
            .values()
            .find(|alias| self.get(alias).is_none())
            .map(|alias| alias.to_vec())
    }
}
#[cfg(test)]
mod testing {
    use super::GeneMap;
    use crate::Library;
    use hashbrown::HashMap;

    /// Three `gene<TAB>sgrna` records, one per line.
    fn build_example_buffer() -> String {
        String::from(
            "gene1\tsgrna1\n\
             gene2\tsgrna2\n\
             gene3\tsgrna3\n",
        )
    }

    /// Gene map parsed from the example buffer; shared by the buffer tests.
    fn example_genemap() -> GeneMap {
        let text = build_example_buffer();
        GeneMap::new_from_buffer(text.as_bytes()).unwrap()
    }

    /// Library whose aliases are all present in the example buffer.
    fn build_library() -> Library {
        let mut table = HashMap::new();
        table.insert(b"ACTG".to_vec(), b"sgrna1".to_vec());
        table.insert(b"gtca".to_vec(), b"sgrna2".to_vec());
        table.insert(b"TCAG".to_vec(), b"sgrna3".to_vec());
        Library::from_hashmap(table).unwrap()
    }

    /// Library containing one alias (`sgrna4`) absent from the example buffer.
    fn build_invalid_library() -> Library {
        let mut table = HashMap::new();
        table.insert(b"ACTG".to_vec(), b"sgrna1".to_vec());
        table.insert(b"gtca".to_vec(), b"sgrna4".to_vec());
        Library::from_hashmap(table).unwrap()
    }

    #[test]
    fn test_build() {
        let parsed = example_genemap();
        assert_eq!(
            parsed.get(b"sgrna1").map(Vec::as_slice),
            Some(&b"gene1"[..])
        );
        assert_eq!(
            parsed.get(b"sgrna2").map(Vec::as_slice),
            Some(&b"gene2"[..])
        );
        assert_eq!(
            parsed.get(b"sgrna3").map(Vec::as_slice),
            Some(&b"gene3"[..])
        );
    }

    #[test]
    fn test_validate_library() {
        let parsed = example_genemap();
        let complete = build_library();
        assert_eq!(parsed.missing_aliases(&complete), None);
    }

    #[test]
    fn test_validate_library_invalid() {
        let parsed = example_genemap();
        let incomplete = build_invalid_library();
        assert_eq!(
            parsed.missing_aliases(&incomplete),
            Some(b"sgrna4".to_vec())
        );
    }

    #[test]
    fn test_from_file() {
        let loaded = GeneMap::new("example/g2s.txt").unwrap();
        assert_eq!(
            loaded.get(b"lib.0").map(Vec::as_slice),
            Some(&b"gene.0"[..])
        );
        assert_eq!(
            loaded.get(b"lib.99").map(Vec::as_slice),
            Some(&b"gene.9"[..])
        );
    }
}