use genomap::{GenomeMap, GenomeMapError};
use indexmap::map::IndexMap;
use ndarray::Array2;
use num_traits::Float;
use serde::{Deserialize, Serialize};
use std::fmt::Display;
use std::fmt::LowerExp;
use std::io;
use std::io::BufRead;
use std::io::Write;
use thiserror::Error;
use crate::file::OutputFile;
use crate::numeric::recomb_dist_matrix;
use super::file::{FileError, InputFile};
use super::numeric::interp1d;
pub type RateFloat = f64;
#[cfg(not(feature = "big-position"))]
pub type Position = u32;
#[cfg(feature = "big-position")]
pub type Position = u64;
#[cfg(not(feature = "big-position"))]
pub type PositionOffset = i32;
#[cfg(feature = "big-position")]
pub type PositionOffset = i64;
pub const CM_MB_CONVERSION: RateFloat = 1e-8;
pub const RATE_PRECISION: usize = 8;
pub const SCI_NOTATION_THRESH: usize = 8;
#[derive(Error, Debug)]
pub enum RecMapError {
#[error("HapMap parsing error: {0}")]
HapMapParsingError(#[from] csv::Error),
#[error("IO error: {0}")]
IOError(#[from] io::Error),
#[error("File reading eror: {0}")]
FileError(#[from] FileError),
#[error("Missing field")]
MissingField,
#[error("Failed to parse a column of a HapMap file")]
ParseError(String),
#[error("Improper Rate value, either NaN or negative ({0})")]
ImproperRate(String),
#[error("Chromosome key '{0}' does not exist in the recombination map")]
NoChrom(String),
#[error("HapMap file not sorted")]
HapMapNotSorted,
#[error("Lookup out of bounds ({0}:{1})")]
LookupOutOfBounds(String, Position),
#[error("Internal Error")]
InternalError(String),
#[error("Recombination map overuns sequence length for {0} ({1} > {2})")]
LengthMismatch(String, Position, Position),
#[error("GenomeMap Error: error updating GenomeMap")]
GenomeMapError(#[from] GenomeMapError),
}
pub fn read_seqlens(filepath: &str) -> Result<IndexMap<String, Position>, csv::Error> {
let mut rdr = csv::ReaderBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_path(filepath)?;
let mut seqlens = IndexMap::new();
#[derive(Debug, Serialize, Deserialize, Default)]
struct SeqLenEntry {
chrom: String,
length: Position,
}
for result in rdr.deserialize() {
let record: SeqLenEntry = result?;
seqlens.insert(record.chrom, record.length);
}
Ok(seqlens)
}
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct RateMap {
pub ends: Vec<Position>,
pub rates: Vec<RateFloat>,
pub map_pos: Vec<RateFloat>,
}
pub struct RateMapIter {
ends: std::vec::IntoIter<Position>,
rates: std::vec::IntoIter<RateFloat>,
map_pos: std::vec::IntoIter<RateFloat>,
}
impl Iterator for RateMapIter {
type Item = (Position, RateFloat, RateFloat);
fn next(&mut self) -> Option<Self::Item> {
match (self.ends.next(), self.rates.next(), self.map_pos.next()) {
(Some(end), Some(rate), Some(map_pos)) => Some((end, rate, map_pos)),
_ => None,
}
}
}
impl IntoIterator for RateMap {
type Item = (Position, RateFloat, RateFloat);
type IntoIter = RateMapIter;
fn into_iter(self) -> Self::IntoIter {
RateMapIter {
ends: self.ends.into_iter(),
rates: self.rates.into_iter(),
map_pos: self.map_pos.into_iter(),
}
}
}
impl RateMap {
pub fn new() -> Self {
Self {
ends: Vec::new(),
rates: Vec::new(),
map_pos: Vec::new(),
}
}
pub fn span(&self) -> Vec<Position> {
self.ends
.windows(2)
.map(|pair| {
assert!(
pair[1] >= pair[0],
"invalid positions encountered while calculating span: {:?}",
&pair
);
pair[1] - pair[0]
})
.collect()
}
pub fn mass(&self) -> Vec<RateFloat> {
self.rates
.iter()
.zip(self.span().iter())
.map(|(&rate, &span)| rate * span as RateFloat)
.collect()
}
pub fn calc_cumulative_mass(&mut self) {
let mass = self.mass();
let cumulative_sum: Vec<_> = mass
.iter()
.scan(0.0, |state, &x| {
*state += x;
Some(*state)
})
.collect();
self.map_pos = cumulative_sum;
}
pub fn total_map_length(&self) -> Option<&RateFloat> {
self.map_pos.last()
}
}
#[derive(Clone, Debug, Default, PartialEq)]
pub struct RecMap {
pub map: GenomeMap<RateMap>,
pub metadata: Option<Vec<String>>,
}
impl RecMap {
pub fn from_hapmap(
filepath: &str,
seqlens: &IndexMap<String, Position>,
) -> Result<RecMap, RecMapError> {
let mut input_file = InputFile::new(filepath);
let _has_metadata = input_file.collect_metadata("#", Some("Chr"))?;
let reader = input_file.continue_reading()?;
let mut rec_map: GenomeMap<RateMap> = GenomeMap::new();
let mut last_chrom: Option<String> = None;
let mut last_end: Option<Position> = None;
let mut has_fourth_column = false;
let mut map_positions: Vec<RateFloat> = Vec::new();
for result in reader.lines() {
let line = result?;
let fields: Vec<&str> = line.split_whitespace().collect();
let chrom = fields.first().ok_or(RecMapError::MissingField)?.to_string();
match last_chrom {
None => {}
Some(last) => {
if chrom != last {
if let Some(seq_len) = seqlens.get(&last) {
let chrom_entry = rec_map.entry_or_default(&last);
if let Some(&last_end) = chrom_entry.ends.last() {
if last_end != *seq_len {
chrom_entry.ends.push(*seq_len);
}
}
}
last_end = None;
}
}
}
last_chrom = Some(chrom.clone());
let end_str = fields.get(1).ok_or(RecMapError::MissingField)?;
let end: Position = end_str.parse().map_err(|_| {
RecMapError::ParseError(format!("Failed to parse end from string: {}", end_str))
})?;
match last_end {
None => Some(end),
Some(last_end) => {
if end >= last_end {
return Err(RecMapError::HapMapNotSorted);
}
Some(end)
}
};
let rate_str = fields.get(2).ok_or(RecMapError::MissingField)?;
let rate: RateFloat = rate_str.parse().map_err(|_| {
RecMapError::ParseError(format!("Failed to parse rate from string: {}", rate_str))
})?;
if rate.is_nan() || rate < 0.0 {
return Err(RecMapError::ImproperRate(format!("{}:{}", chrom, end)));
}
if let Some(map_pos_str) = fields.get(3) {
has_fourth_column = true;
let map_pos: RateFloat = map_pos_str.parse().map_err(|_| {
RecMapError::ParseError(format!(
"Failed to parse map position from string: {}",
map_pos_str
))
})?;
map_positions.push(map_pos);
}
let rate = CM_MB_CONVERSION * rate;
if let Some(chrom_entry) = rec_map.get_mut(&chrom) {
chrom_entry.ends.push(end);
chrom_entry.rates.push(rate);
} else {
let mut new_rate_map = RateMap::new();
if end != 0 {
new_rate_map.ends.push(0);
new_rate_map.rates.push(0.0);
}
new_rate_map.ends.push(end);
new_rate_map.rates.push(rate);
if has_fourth_column {
}
rec_map.insert(&chrom, new_rate_map)?;
}
}
if let Some(last) = last_chrom {
if let Some(seq_len) = seqlens.get(&last) {
let chrom_entry = rec_map.get_mut(&last).expect(
"internal error: please report at http://github.com/vsbuffalo/recmap/issues",
);
if chrom_entry.ends.is_empty() || chrom_entry.ends.last().unwrap() != seq_len {
if chrom_entry.ends.last().unwrap() >= seq_len {
let last_end = *chrom_entry.ends.last().unwrap();
return Err(RecMapError::LengthMismatch(last, last_end, *seq_len));
}
chrom_entry.ends.push(*seq_len);
}
}
}
let metadata = input_file.comments;
let mut rec_map = RecMap {
map: rec_map,
metadata,
};
rec_map.generate_map_positions();
Ok(rec_map)
}
pub fn len(&self) -> usize {
self.map.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn iter(&self) -> impl Iterator<Item = (&String, &RateMap)> {
self.map.iter()
}
fn generate_map_positions(&mut self) {
for rate_map in self.map.values_mut() {
rate_map.calc_cumulative_mass();
}
}
pub fn interpolate_map_position(
&self,
name: &str,
position: Position,
) -> Result<RateFloat, RecMapError> {
let rate_map = self
.map
.get(name)
.ok_or(RecMapError::NoChrom(name.to_string()))?;
let ends = &rate_map.ends;
let interp_result = interp1d(&ends[0..ends.len() - 1],
&rate_map.map_pos,
position);
let interpolated_map_pos =
interp_result.ok_or(RecMapError::LookupOutOfBounds(name.to_string(), position))?;
Ok(interpolated_map_pos)
}
pub fn interpolate_map_positions(
&self,
chrom: &str,
positions: &[Position],
) -> Result<Vec<RateFloat>, RecMapError> {
let positions: Vec<RateFloat> = positions
.iter()
.map(|p| self.interpolate_map_position(chrom, *p))
.collect::<Result<Vec<_>, _>>()?;
Ok(positions)
}
pub fn recomb_dist_matrix(
&self,
chrom: &str,
positions_x: &[Position],
positions_y: &[Position],
haldane: bool,
rec_floor: Option<RateFloat>,
) -> Result<Array2<RateFloat>, RecMapError> {
let x_pos = self.interpolate_map_positions(chrom, positions_x)?;
let y_pos = self.interpolate_map_positions(chrom, positions_y)?;
Ok(recomb_dist_matrix(&x_pos, &y_pos, haldane, rec_floor))
}
pub fn write_hapmap(&self, filepath: Option<&str>) -> Result<(), RecMapError> {
let mut writer: Box<dyn Write> = match filepath {
Some(path) => {
let file = OutputFile::new(path, None);
file.writer()?
}
None => {
Box::new(std::io::stdout())
}
};
writeln!(writer, "Chromosome\tPosition(bp)\tRate(cM/Mb)\tMap(cM)")?;
for (chrom, rate_map) in self.map.iter() {
let n = rate_map.ends.len();
let ends = &rate_map.ends[1..n - 1];
for (i, end) in ends.iter().enumerate() {
let rate = rate_map.rates[i + 1];
let map_pos = rate_map.map_pos[i + 1];
writeln!(
writer,
"{}\t{}\t{}\t{}",
chrom,
end,
format_float(rate / CM_MB_CONVERSION),
format_float(map_pos),
)?;
}
}
Ok(())
}
pub fn write_tsv(&self, filepath: Option<&str>) -> Result<(), RecMapError> {
let mut writer: Box<dyn Write> = match filepath {
Some(path) => {
let file = OutputFile::new(path, None);
file.writer()?
}
None => {
Box::new(std::io::stdout())
}
};
for (chrom, rate_map) in self.map.iter() {
let ranges: Vec<(Position, Position)> = rate_map
.ends
.windows(2)
.map(|pair| (pair[0], pair[1]))
.collect();
for (i, range) in ranges.iter().enumerate() {
let rate = rate_map.rates[i];
let rate_rescaled: RateFloat = rate / CM_MB_CONVERSION;
let formatted_rate = format!("{:.1$e}", rate_rescaled, RATE_PRECISION);
writeln!(
writer,
"{}\t{}\t{}\t{}",
chrom, range.0, range.1, formatted_rate
)?;
}
}
Ok(())
}
}
pub fn format_float<T>(x: T) -> String
where
T: Float + LowerExp + Display,
{
let min = T::from(SCI_NOTATION_THRESH).unwrap();
let max = T::from(SCI_NOTATION_THRESH).unwrap();
if x.abs().log10() < -min || x.abs().log10() > max {
format!("{:.1$e}", x, RATE_PRECISION)
} else {
format!("{:.*}", RATE_PRECISION, x)
}
}
#[cfg(test)]
mod tests {
use super::Position;
use crate::{
numeric::{assert_float_eq, assert_floats_eq},
prelude::*,
};
use indexmap::IndexMap;
use tempfile::tempdir;
fn mock_seqlens() -> IndexMap<String, Position> {
let seqlens = indexmap::indexmap! {
"chr1".to_string() => 25,
"chr2".to_string() => 32,
"chr3".to_string() => 22,
};
seqlens
}
fn read_hapmap() -> RecMap {
let seqlens = mock_seqlens();
let rec_map = RecMap::from_hapmap("tests_data/test_hapmap.txt", &seqlens).unwrap();
rec_map
}
fn to_morgans(x: Vec<RateFloat>) -> Vec<RateFloat> {
x.iter().map(|v| v * CM_MB_CONVERSION).collect()
}
#[test]
fn test_read_hapmap() {
let rm = read_hapmap();
assert_eq!(rm.len(), 3);
assert!(!rm.is_empty());
dbg!(&rm.map.get("chr1").unwrap().map_pos);
let chr1_map = rm.map.get("chr1").unwrap();
assert_eq!(chr1_map.ends.len(), chr1_map.rates.len() + 1);
assert_eq!(chr1_map.ends, vec![0, 10, 15, 20, 25]);
assert_eq!(chr1_map.rates, to_morgans(vec![0.0, 1.10, 1.50, 4.33]));
let total_len = *chr1_map.total_map_length().unwrap();
assert_float_eq(total_len, 34.65 * CM_MB_CONVERSION, 1e-3);
assert_floats_eq(
&chr1_map.map_pos,
&to_morgans(vec![0.0, 5.5, 13., 34.65]),
1e-3,
);
}
#[test]
#[ignore = "for debugging"]
fn test_write_hapmap_local() {
let seqlens = mock_seqlens();
let rm = read_hapmap();
let filepath = "test_hapmap.txt";
rm.write_hapmap(Some(filepath)).unwrap();
let rm_readin = RecMap::from_hapmap(filepath, &seqlens).unwrap();
assert_eq!(rm_readin, rm);
}
#[test]
fn test_write_hapmap() {
let seqlens = mock_seqlens();
let rm = read_hapmap();
let dir = tempdir().unwrap();
let binding = dir.path().join("test_hapmap.txt");
let filepath = binding.to_str().unwrap();
rm.write_hapmap(Some(filepath)).unwrap();
let rm_readin = RecMap::from_hapmap(filepath, &seqlens).unwrap();
assert_eq!(rm_readin, rm);
}
}