1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
//! This module contains the implementation of a classic `PairHMM` as described in
//! Durbin, R., Eddy, S., Krogh, A., & Mitchison, G. (1998). Biological Sequence Analysis.
//! Current Topics in Genome Analysis 2008. http://doi.org/10.1017/CBO9780511790492.
//! It also contains a modified variant `HomopolyPairHMM` with additional homopolymer states suited
//! for dealing with homopolymer runs in sequencing as often encountered in Oxford Nanopore
//! sequencing data.
//!
//! Traits defined in this module apply to both `PairHMM` and `HomopolyPairHMM`.
//!
//! # Examples
//! ```
//! use approx::assert_relative_eq;
//! use bio::stats::pairhmm::{
//! EmissionParameters, GapParameters, PairHMM, StartEndGapParameters, XYEmission,
//! };
//! use bio::stats::{LogProb, Prob};
//! use num_traits::Zero;
//!
//! // Two sequences for which we'd like to know if they are likely related.
//! let x = b"AAAA";
//! let y = b"AAAT";
//!
//! // For this example, we disallow gaps, so all probabilities are zero here.
//! struct GapParams;
//! impl GapParameters for GapParams {
//! fn prob_gap_x(&self) -> LogProb {
//! LogProb::zero()
//! }
//! fn prob_gap_y(&self) -> LogProb {
//! LogProb::zero()
//! }
//! fn prob_gap_x_extend(&self) -> LogProb {
//! LogProb::zero()
//! }
//! fn prob_gap_y_extend(&self) -> LogProb {
//! LogProb::zero()
//! }
//! }
//! let gap_params = GapParams;
//!
//! // The PairHMM instance stores the gap params, since these are constant.
//! let mut pairhmm = PairHMM::new(&gap_params);
//!
//! // However, emission parameters depend on the actual sequences
//! struct EmissionParams {
//! x: &'static [u8],
//! y: &'static [u8],
//! }
//!
//! const PROB_SUBSTITUTION: f64 = 0.1;
//! const PROB_NO_SUBSTITUION: f64 = 1. - PROB_SUBSTITUTION;
//! impl EmissionParameters for EmissionParams {
//! fn prob_emit_xy(&self, i: usize, j: usize) -> XYEmission {
//! if self.x[i] == self.y[j] {
//! // if two bases match, emit a Match!
//! XYEmission::Match(LogProb::from(Prob(PROB_NO_SUBSTITUION)))
//! } else {
//! // otherwise emit a Mismatch!
//! // Note that the probability here is `mismatch / 3`, since probabilities should sum
//! // to 1 and there are 3 possible mismatch configurations
//! XYEmission::Mismatch(LogProb::from(Prob(PROB_SUBSTITUTION / 3.)))
//! }
//! }
//!
//! // In this example, emitting x[i] is as likely as not observing a mismatch.
//! // In more complex cases, this might e.g. depend on base qualities reported by the sequencer
//! fn prob_emit_x(&self, i: usize) -> LogProb {
//! LogProb::from(Prob(PROB_NO_SUBSTITUION))
//! }
//! fn prob_emit_y(&self, j: usize) -> LogProb {
//! LogProb::from(Prob(PROB_NO_SUBSTITUION))
//! }
//!
//! fn len_x(&self) -> usize {
//! self.x.len()
//! }
//! fn len_y(&self) -> usize {
//! self.y.len()
//! }
//! }
//!
//! // Since we want to do global alignment here, disallow free start and end gaps in x.
//! struct GlobalAlignmentMode;
//! impl StartEndGapParameters for GlobalAlignmentMode {
//! fn free_start_gap_x(&self) -> bool {
//! false
//! }
//! fn free_end_gap_x(&self) -> bool {
//! false
//! }
//! }
//!
//! // Finally calculate the probability of relatedness between x and y!
//! let prob_related = pairhmm.prob_related(&EmissionParams { x, y }, &GlobalAlignmentMode, None);
//!
//! // … and compare it to a rough estimation
//! let prob_expected = LogProb::from(Prob(PROB_NO_SUBSTITUION.powi(3) * PROB_SUBSTITUTION / 3.));
//! assert_relative_eq!(*prob_related, *prob_expected, epsilon = 1e-5);
//! ```
pub use homopolypairhmm::{BaseSpecificHopParameters, HomopolyPairHMM, HopParameters};
pub use pairhmm::PairHMM;
use crate::stats::LogProb;
mod homopolypairhmm;
mod pairhmm;
// traits common to pairhmm implementations
/// Trait for parametrization of `PairHMM` emission behavior.
pub trait EmissionParameters {
/// Emission probability for `(x[i], y[j])`.
/// Returns a tuple with probability and a boolean indicating whether emissions match
/// (e.g., are the same DNA alphabet letter).
fn prob_emit_xy(&self, i: usize, j: usize) -> XYEmission;
/// Emission probability for `(x[i], -)`.
fn prob_emit_x(&self, i: usize) -> LogProb;
/// Emission probability for `(-, y[j])`.
fn prob_emit_y(&self, j: usize) -> LogProb;
fn len_x(&self) -> usize;
fn len_y(&self) -> usize;
}
/// Trait needed for the `HomopolyPairHMM`, because its implementation details
/// depend on the actual bases to distinguish between Match states.
pub trait Emission {
/// Base emitted at `i` in sequence `x`.
/// Should be one of b'A', b'C', b'G' or b'T'.
fn emission_x(&self, i: usize) -> u8;
/// Base emitted at `i` in sequence `y`.
/// Should be one of b'A', b'C', b'G' or b'T'.
fn emission_y(&self, j: usize) -> u8;
}
/// Trait for parametrization of `PairHMM` gap behavior.
pub trait GapParameters {
/// Probability to open gap in x.
fn prob_gap_x(&self) -> LogProb;
/// Probability to open gap in y.
fn prob_gap_y(&self) -> LogProb;
/// Probability to extend gap in x.
fn prob_gap_x_extend(&self) -> LogProb;
/// Probability to extend gap in y.
fn prob_gap_y_extend(&self) -> LogProb;
}
/// Trait for parametrization of `PairHMM` start and end gap behavior.
/// This trait can be used to implement global and semiglobal alignments.
///
/// * global: methods return `false` and `LogProb::ln_zero()`.
/// * semiglobal: methods return `true` and `LogProb::ln_one()`.
pub trait StartEndGapParameters {
/// Probability to start at `x[i]`. This can be left unchanged if you use `free_start_gap_x` and
/// `free_end_gap_x`.
#[inline]
#[allow(unused_variables)]
fn prob_start_gap_x(&self, i: usize) -> LogProb {
if self.free_start_gap_x() {
LogProb::ln_one()
} else {
// For global alignment, this has to return 0.0.
LogProb::ln_zero()
}
}
/// Allow free start gap in x.
fn free_start_gap_x(&self) -> bool;
/// Allow free end gap in x.
fn free_end_gap_x(&self) -> bool;
}
#[derive(Copy, Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
pub enum XYEmission {
Match(LogProb),
Mismatch(LogProb),
}
impl XYEmission {
pub fn prob(&self) -> LogProb {
match *self {
XYEmission::Match(p) => p,
XYEmission::Mismatch(p) => p,
}
}
pub fn is_match(&self) -> bool {
match *self {
XYEmission::Match(_) => true,
XYEmission::Mismatch(_) => false,
}
}
}