use anyhow::{bail, Result};
use fxread::{initialize_reader, Record};
use ndarray::{Array1, Array2, ArrayBase, Axis, Dim, ViewRepr};
use ndarray_stats::{DeviationExt, EntropyExt, QuantileExt};
#[derive(Debug, Copy, Clone)]
pub enum Offset {
Forward(usize),
Reverse(usize),
}
impl Offset {
#[must_use]
pub fn index(&self) -> &usize {
match self {
Self::Forward(index) | Self::Reverse(index) => index,
}
}
pub fn is_forward(&self) -> bool {
matches!(self, Self::Forward(_))
}
pub fn is_reverse(&self) -> bool {
matches!(self, Self::Reverse(_))
}
}
fn get_sequence_size(reader: &mut dyn Iterator<Item = Record>) -> usize {
reader.next().expect("empty reader").seq().len()
}
fn base_map(c: u8) -> Option<usize> {
match c {
b'A' => Some(0),
b'C' => Some(1),
b'G' => Some(2),
b'T' => Some(3),
_ => None,
}
}
fn position_counts(reader: &mut dyn Iterator<Item = Record>) -> Array2<f64> {
let size = get_sequence_size(reader);
reader.fold(Array2::<f64>::zeros((size, 4)), |mut posmat, record| {
record
.seq()
.iter()
.enumerate()
.take(size)
.map(|(idx, c)| (idx, base_map(*c)))
.for_each(|(idx, jdx)| {
if let Some(j) = jdx {
posmat[[idx, j]] += 1.;
} else {
posmat[[idx, 0]] += 1.;
posmat[[idx, 1]] += 1.;
posmat[[idx, 2]] += 1.;
posmat[[idx, 3]] += 1.;
};
});
posmat
})
}
fn normalize_counts(matrix: Array2<f64>) -> Array2<f64> {
let (x, y) = matrix.dim();
let sums = matrix.sum_axis(Axis(1));
let norm = matrix / sums.broadcast((y, x)).expect("incompatible sizes").t();
norm
}
fn positional_entropy(reader: &mut dyn Iterator<Item = Record>) -> Array1<f64> {
let pos_prob = normalize_counts(position_counts(reader));
pos_prob.map_axis(Axis(1), |axis| {
axis.entropy().expect("Unexpected Negatives in Calculation")
})
}
fn slice_array(
array: &Array1<f64>,
x: usize,
y: usize,
) -> ArrayBase<ViewRepr<&f64>, Dim<[usize; 1]>> {
array.slice(ndarray::s![x..y])
}
fn windowed_mse(array1: &Array1<f64>, array2: &Array1<f64>) -> Array1<f64> {
let size = array2.len() - array1.len() + 1;
(0..size).map(|idx| (idx, idx + array1.len())).fold(
Array1::<f64>::zeros(size),
|mut arr, (x, y)| {
arr[x] += array1
.mean_sq_err(&slice_array(array2, x, y))
.expect("unexpected error");
arr
},
)
}
fn assign_offset(mse_forward: &Array1<f64>, mse_reverse: &Array1<f64>) -> Offset {
let argmin_forward = match mse_forward.argmin() {
Ok(m) => m,
Err(why) => panic!("Unexpected minmax error in entropy: {}", why),
};
let argmin_reverse = match mse_reverse.argmin() {
Ok(m) => m,
Err(why) => panic!("Unexpected minmax error in entropy: {}", why),
};
let min_forward = match mse_forward.min() {
Ok(m) => m,
Err(why) => panic!("Unexpected minmax error in entropy: {}", why),
};
let min_reverse = match mse_reverse.min() {
Ok(m) => m,
Err(why) => panic!("Unexpected minmax error in entropy: {}", why),
};
if min_forward < min_reverse {
Offset::Forward(argmin_forward)
} else {
Offset::Reverse(argmin_reverse)
}
}
fn minimize_mse(reference: &Array1<f64>, comparison: &Array1<f64>) -> Result<Offset> {
if comparison.len() < reference.len() {
bail!("Sequences in reference library are larger than the sequences in input.\nConsider reducing the length of your reference sequences (i.e. extracting the variable region of the sgRNA or reducing the length of the adapters.)")
}
let rev_comparison = comparison.iter().rev().copied().collect();
let mse_forward = windowed_mse(reference, comparison);
let mse_reverse = windowed_mse(reference, &rev_comparison);
Ok(assign_offset(&mse_forward, &mse_reverse))
}
pub fn entropy_offset(
library_path: &str,
input_paths: &[String],
subsample: usize,
) -> Result<Offset> {
let mut reference = initialize_reader(library_path)?;
let mut comparison = initialize_reader(&input_paths[0])?.take(subsample);
let reference_entropy = positional_entropy(&mut reference);
let comparison_entropy = positional_entropy(&mut comparison);
let index = minimize_mse(&reference_entropy, &comparison_entropy)?;
Ok(index)
}
pub fn entropy_offset_group(
library_path: &str,
input_paths: &[String],
subsample: usize,
) -> Result<Vec<Offset>> {
let mut reference = initialize_reader(library_path)?;
let reference_entropy = positional_entropy(&mut reference);
let result = input_paths
.iter()
.map(|x| {
initialize_reader(x)
.unwrap_or_else(|_| panic!("Unable to open file: {}", x))
.take(subsample)
})
.map(|mut x| positional_entropy(&mut x))
.map(|x| minimize_mse(&reference_entropy, &x));
let mut results = vec![];
for x in result {
match x {
Ok(y) => results.push(y),
Err(why) => bail!("Error in entropy offset calculation:\n\n{}", why),
}
}
Ok(results)
}
#[cfg(test)]
mod test {
use crate::offsetter::base_map;
use super::{
get_sequence_size, minimize_mse, normalize_counts, position_counts, positional_entropy,
Offset,
};
use fxread::{FastaReader, FastxRead, Record};
use ndarray::Array1;
fn reader() -> Box<dyn FastxRead<Item = Record>> {
let sequence: &'static [u8] = b">seq.0\nACT\n>seq.1\nACC\n>seq.2\nACT\n";
Box::new(FastaReader::new(sequence))
}
fn reader_with_n() -> Box<dyn FastxRead<Item = Record>> {
let sequence: &'static [u8] = b">seq.0\nACT\n>seq.1\nACC\n>seq.2\nACT\n>seq.3\nACN\n";
Box::new(FastaReader::new(sequence))
}
fn offset_reader() -> Box<dyn FastxRead<Item = Record>> {
let sequence: &'static [u8] = b">seq.0\nAACAAACT\n>seq.1\nAACAAACC\n>seq.2\nAACAAACT\n";
Box::new(FastaReader::new(sequence))
}
fn rc_offset_reader() -> Box<dyn FastxRead<Item = Record>> {
let sequence: &'static [u8] = b">seq.0\nAGTTTGTT\n>seq.1\nGGTTTGTT\n>seq.2\nAGTTTGTT\n";
Box::new(FastaReader::new(sequence))
}
#[test]
fn minimization() {
let arr = Array1::linspace(0., 10., 11);
let brr = Array1::linspace(10., 20., 100);
match minimize_mse(&arr, &brr).unwrap() {
Offset::Forward(x) => assert_eq!(x, 0),
Offset::Reverse(_) => assert!(false),
}
}
#[test]
fn undersized_minimization() {
let arr = Array1::linspace(0., 10., 11);
let brr = Array1::linspace(10., 20., 5);
assert!(minimize_mse(&arr, &brr).is_err());
}
#[test]
fn sequence_size() {
let mut reader = reader();
let size = get_sequence_size(&mut reader);
assert_eq!(size, 3);
assert_eq!(reader.count(), 2);
}
#[test]
fn positional_counts() {
let posmat = position_counts(&mut reader());
let expected = ndarray::array![
[2.0, 0.0, 0.0, 0.0],
[0.0, 2.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 1.0]
];
let diff = posmat - expected;
assert_eq!(diff.sum(), 0.);
}
#[test]
fn normalize() {
let test = ndarray::array![
[2.0, 0.0, 0.0, 0.0],
[0.0, 2.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 1.0]
];
let norm = normalize_counts(test);
let expected = ndarray::array![
[1.0, 0.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 0.0],
[0.0, 0.5, 0.0, 0.5]
];
let diff = norm - expected;
assert_eq!(diff.sum(), 0.);
}
#[test]
fn offset() {
let mut reference = reader();
let mut comparison = offset_reader();
let reference_entropy = positional_entropy(&mut reference);
let comparison_entropy = positional_entropy(&mut comparison);
let index = match minimize_mse(&reference_entropy, &comparison_entropy).unwrap() {
Offset::Forward(x) => x,
Offset::Reverse(_) => panic!("Unexpected reverse"),
};
assert_eq!(index, 5);
}
#[test]
fn rc_offset() {
let mut reference = reader();
let mut comparison = rc_offset_reader();
let reference_entropy = positional_entropy(&mut reference);
let comparison_entropy = positional_entropy(&mut comparison);
let index = match minimize_mse(&reference_entropy, &comparison_entropy).unwrap() {
Offset::Forward(_) => panic!("Unexpected forward"),
Offset::Reverse(x) => x,
};
assert_eq!(index, 5);
}
#[test]
fn test_offset_enum() {
let offset = Offset::Forward(5);
assert_eq!(offset.index(), &5);
assert_eq!(offset.is_forward(), true);
assert_eq!(offset.is_reverse(), false);
let offset = Offset::Reverse(5);
assert_eq!(offset.index(), &5);
assert_eq!(offset.is_forward(), false);
assert_eq!(offset.is_reverse(), true);
}
#[test]
fn test_base_map() {
assert_eq!(base_map(b'A'), Some(0));
assert_eq!(base_map(b'C'), Some(1));
assert_eq!(base_map(b'G'), Some(2));
assert_eq!(base_map(b'T'), Some(3));
assert_eq!(base_map(b'N'), None);
assert_eq!(base_map(b'X'), None);
}
#[test]
fn test_position_counts_with_n() {
let posmat = position_counts(&mut reader_with_n());
let expected = ndarray::array![
[3.0, 0.0, 0.0, 0.0],
[0.0, 3.0, 0.0, 0.0],
[2.0, 1.0, 2.0, 1.0]
];
let diff = posmat - expected;
assert_eq!(diff.sum(), 0.);
}
}