use serde::{Deserialize, Serialize};
use crate::alphabet::{AlphabetEncoding, DNA, DnaSymbol, Protein, ProteinSymbol};
pub trait ModelCalculation<A: AlphabetEncoding> {
type Acc;
fn init() -> Self::Acc;
fn accumulate(acc: &mut Self::Acc, a: A::Symbol, b: A::Symbol) -> Self::Acc;
fn finalize(acc: &Self::Acc, aln_len: usize, n_comparable: usize) -> f64;
}
#[inline(always)]
pub fn pairwise_distance<M, A>(s1: &[A::Symbol], s2: &[A::Symbol]) -> f64
where
M: ModelCalculation<A>,
A: AlphabetEncoding,
{
let aln_len = s1.len();
let mut acc = M::init();
let mut n_comparable: usize = 0;
for k in 0..aln_len {
if !A::is_gap(s1[k]) && !A::is_gap(s2[k]) {
n_comparable += 1;
}
acc = M::accumulate(&mut acc, s1[k], s2[k]);
}
M::finalize(&acc, aln_len, n_comparable)
}
pub struct PDiff;
impl ModelCalculation<DNA> for PDiff {
type Acc = usize;
fn init() -> Self::Acc {
0
}
fn accumulate(acc: &mut Self::Acc, a: DnaSymbol, b: DnaSymbol) -> Self::Acc {
if a != b && a != DnaSymbol::Gap && b != DnaSymbol::Gap {
*acc += 1;
}
*acc
}
fn finalize(acc: &Self::Acc, _aln_len: usize, n_comparable: usize) -> f64 {
if n_comparable == 0 {
return 0.0;
}
*acc as f64 / n_comparable as f64
}
}
impl ModelCalculation<Protein> for PDiff {
type Acc = usize;
fn init() -> Self::Acc {
0
}
fn accumulate(acc: &mut Self::Acc, a: ProteinSymbol, b: ProteinSymbol) -> Self::Acc {
if a != b && a != ProteinSymbol::Gap && b != ProteinSymbol::Gap {
*acc += 1;
}
*acc
}
fn finalize(acc: &Self::Acc, _aln_len: usize, n_comparable: usize) -> f64 {
if n_comparable == 0 {
return 0.0;
}
*acc as f64 / n_comparable as f64
}
}
pub struct JukesCantor;
impl ModelCalculation<DNA> for JukesCantor {
type Acc = usize;
fn init() -> Self::Acc {
0
}
fn accumulate(acc: &mut Self::Acc, a: DnaSymbol, b: DnaSymbol) -> Self::Acc {
if a != b && a != DnaSymbol::Gap && b != DnaSymbol::Gap {
*acc += 1;
}
*acc
}
fn finalize(acc: &Self::Acc, _aln_len: usize, n_comparable: usize) -> f64 {
if n_comparable == 0 {
return 0.0;
}
let p = *acc as f64 / n_comparable as f64;
if p >= 0.75 {
f64::INFINITY } else {
-0.75 * (1.0 - (4.0 / 3.0) * p).ln()
}
}
}
pub struct Kimura2P;
impl ModelCalculation<DNA> for Kimura2P {
type Acc = (usize, usize);
fn init() -> Self::Acc {
(0, 0)
}
fn accumulate(acc: &mut Self::Acc, a: DnaSymbol, b: DnaSymbol) -> Self::Acc {
if a != b && a != DnaSymbol::Gap && b != DnaSymbol::Gap {
match (a, b) {
(DnaSymbol::A, DnaSymbol::G)
| (DnaSymbol::G, DnaSymbol::A)
| (DnaSymbol::C, DnaSymbol::T)
| (DnaSymbol::T, DnaSymbol::C) => acc.0 += 1, _ => acc.1 += 1, }
}
*acc
}
fn finalize(acc: &Self::Acc, _aln_len: usize, n_comparable: usize) -> f64 {
if n_comparable == 0 {
return 0.0;
}
let (ti, tv) = *acc;
let p = ti as f64 / n_comparable as f64;
let q = tv as f64 / n_comparable as f64;
let denom1 = 1.0 - 2.0 * p - q;
let denom2 = 1.0 - 2.0 * q;
if denom1 <= 0.0 || denom2 <= 0.0 {
f64::INFINITY } else {
-0.5 * denom1.ln() - 0.25 * denom2.ln()
}
}
}
pub struct Poisson;
impl ModelCalculation<Protein> for Poisson {
type Acc = usize;
fn init() -> Self::Acc {
0
}
fn accumulate(acc: &mut Self::Acc, a: ProteinSymbol, b: ProteinSymbol) -> Self::Acc {
if a != b && a != ProteinSymbol::Gap && b != ProteinSymbol::Gap {
*acc += 1;
}
*acc
}
fn finalize(acc: &Self::Acc, _aln_len: usize, n_comparable: usize) -> f64 {
if n_comparable == 0 {
return 0.0;
}
let p = *acc as f64 / n_comparable as f64;
if p >= 1.0 {
f64::INFINITY } else {
-(1.0 - p).ln()
}
}
}
#[derive(Clone, Debug, ts_rs::TS, Serialize, Deserialize)]
#[ts(export, export_to = "../../wasm/types/lib_types.ts")]
#[cfg_attr(feature = "cli", derive(clap::ValueEnum))]
pub enum SubstitutionModel {
PDiff,
JukesCantor,
Kimura2P,
Poisson,
}
#[cfg(test)]
mod tests {
use super::*;
use crate::alphabet::{DNA, Protein, dna, protein};
#[test]
fn test_pdiff_dna_two_differences() {
assert_eq!(
pairwise_distance::<PDiff, DNA>(&dna!("ACGTA"), &dna!("AGGTC")),
0.4
);
}
#[test]
fn test_pdiff_dna_identical() {
let s = dna!("ACGT");
assert_eq!(pairwise_distance::<PDiff, DNA>(&s, &s), 0.0);
}
#[test]
fn test_pdiff_dna_gapped_positions_excluded_from_denominator() {
assert!(
(pairwise_distance::<PDiff, DNA>(&dna!("A-G"), &dna!("TCG")) - 1.0 / 2.0).abs() < 1e-12
);
}
#[test]
fn test_pdiff_protein_one_difference() {
assert_eq!(
pairwise_distance::<PDiff, Protein>(&protein!("ACDE"), &protein!("ACDF")),
0.25
);
}
#[test]
fn test_pdiff_protein_identical() {
let s = protein!("ARN");
assert_eq!(pairwise_distance::<PDiff, Protein>(&s, &s), 0.0);
}
#[test]
fn test_pdiff_protein_gaps_not_counted_as_differences() {
assert_eq!(
pairwise_distance::<PDiff, Protein>(&protein!("A-D"), &protein!("ARD")),
0.0
);
}
#[test]
fn test_jukes_cantor_dna() {
let p = 0.4_f64;
let expected = -0.75 * (1.0 - (4.0 / 3.0) * p).ln();
assert!(
(pairwise_distance::<JukesCantor, DNA>(&dna!("ACGTA"), &dna!("AGGTC")) - expected)
.abs()
< 1e-6
);
}
#[test]
fn test_jukes_cantor_identical() {
let s = dna!("ACGT");
assert_eq!(pairwise_distance::<JukesCantor, DNA>(&s, &s), 0.0);
}
#[test]
fn test_jukes_cantor_saturated_returns_infinity() {
assert_eq!(
pairwise_distance::<JukesCantor, DNA>(&dna!("AAAA"), &dna!("CGTC")),
f64::INFINITY
);
}
#[test]
fn test_kimura2p_identical() {
let s = dna!("ACGT");
assert_eq!(pairwise_distance::<Kimura2P, DNA>(&s, &s), 0.0);
}
#[test]
fn test_kimura2p_pure_transitions() {
assert_eq!(
pairwise_distance::<Kimura2P, DNA>(&dna!("ACAC"), &dna!("GTGT")),
f64::INFINITY
);
}
#[test]
fn test_kimura2p_pure_transversions() {
assert_eq!(
pairwise_distance::<Kimura2P, DNA>(&dna!("AAGG"), &dna!("CTCT")),
f64::INFINITY
);
}
#[test]
fn test_kimura2p_mixed() {
let p = 0.25_f64;
let q = 0.25_f64;
let expected = -0.5 * (1.0 - 2.0 * p - q).ln() - 0.25 * (1.0 - 2.0 * q).ln();
assert!(
(pairwise_distance::<Kimura2P, DNA>(&dna!("AATT"), &dna!("GCTT")) - expected).abs()
< 1e-12
);
}
#[test]
fn test_kimura2p_saturated_transversions_returns_infinity() {
assert_eq!(
pairwise_distance::<Kimura2P, DNA>(&dna!("AG"), &dna!("CT")),
f64::INFINITY
);
}
#[test]
fn test_poisson_identical() {
let s = protein!("ARND");
assert_eq!(pairwise_distance::<Poisson, Protein>(&s, &s), 0.0);
}
#[test]
fn test_poisson_one_difference() {
let expected = -(1.0_f64 - 0.25).ln();
assert!(
(pairwise_distance::<Poisson, Protein>(&protein!("ARND"), &protein!("ARNE"))
- expected)
.abs()
< 1e-12
);
}
#[test]
fn test_poisson_fully_different_returns_infinity() {
assert_eq!(
pairwise_distance::<Poisson, Protein>(&protein!("AR"), &protein!("DE")),
f64::INFINITY
);
}
#[test]
fn test_poisson_gaps_not_counted_as_differences() {
assert_eq!(
pairwise_distance::<Poisson, Protein>(&protein!("A-N"), &protein!("ARN")),
0.0
);
}
}