bio/pattern_matching/pssm/
mod.rs

1// Copyright 2018 Kieran Hervold
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Create a weight matrix representing a set of aligned reference sequences
7//! that constitute a motif, and use this matrix to scan query sequences for
8//! occurrences of this motif.
9//! Complexity: O(n*m) for motif length n and query length m
10//!
11//! The position-specific scoring matrix (PSSM), aka position weight matrix (PWM),
12//! algorithm is implemented for both DNA and amino-acid sequences.
13//!
14//! # Examples
15//!
16//! use bio::pattern_matching::pssm::DNAMotif;
17//! let pssm = DNAMotif::from_seqs(vec![
18//!            b"AAAA".to_vec(),
19//!            b"AATA".to_vec(),
20//!            b"AAGA".to_vec(),
21//!            b"AAAA".to_vec(),
22//!        ].as_ref(), None).unwrap();
23//! let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
24//! println!("motif found at position {}", start_pos);
25//!
26//! /* amino acid sequences are supported, too */
27//! use pssm::pattern_matching::pssm::ProtMotif;
28//! let pssm = ProtMotif::from_seqs(vec![
29//!            b"ARNNYM".to_vec(),
30//!            b"ARNRYM".to_vec(),
31//!            b"ARNNCM".to_vec(),
32//!            b"ARNNYM".to_vec(),
33//!        ].as_ref(), None).unwrap();
34
35use std::borrow::Borrow;
36
37use itertools::Itertools;
38use ndarray::prelude::*;
39
40mod dnamotif;
41pub mod errors;
42mod protmotif;
43
44pub use self::dnamotif::DNAMotif;
45pub use self::errors::{Error, Result};
46pub use self::protmotif::ProtMotif;
47
48/// default pseudocount - used to prevent 0 tallies
49pub const DEF_PSEUDO: f32 = 0.5;
50/// approximately zero
51pub const EPSILON: f32 = 1e-5;
52/// value representing an invalid monomer in lookup table
53pub const INVALID_MONO: u8 = 255;
54
55/// Represents motif score & location of match
56#[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
57pub struct ScoredPos {
58    pub loc: usize,
59    pub sum: f32,
60    pub scores: Vec<f32>,
61}
62
63impl Default for ScoredPos {
64    fn default() -> ScoredPos {
65        ScoredPos {
66            loc: 0,
67            sum: f32::NEG_INFINITY,
68            scores: Vec::new(),
69        }
70    }
71}
72
73/// Trait containing code shared between DNA and protein implementations
74/// of the position-specific scoring matrix.
75pub trait Motif {
76    /// Lookup table mapping monomer -> index
77    const LK: [u8; 127] = [INVALID_MONO; 127];
78    /// All monomers, in order corresponding to lookup table
79    const MONOS: &'static [u8] = b"";
80    /// Monomer count - equal to length of `MONOS`
81    const MONO_CT: usize = 0;
82
83    /// Returns a weight matrix representing the sequences provided.
84    /// This code is shared by implementations of `from_seqs`
85    /// # Arguments
86    /// * `seqs` - sequences incorporated into motif
87    /// * `pseudos` - array slice with a pseudocount for each monomer;
88    ///    defaults to DEF_PSEUDO for all if None is supplied
89    ///
90    /// FIXME: pseudos should be an array of size MONO_CT, but that
91    /// is currently unsupported in stable rust
92    fn seqs_to_weights(seqs: &[Vec<u8>], _pseudos: Option<&[f32]>) -> Result<Array2<f32>> {
93        if _pseudos.is_some() {
94            if _pseudos.unwrap().len() != Self::MONO_CT {
95                return Err(Error::InvalidPseudos {
96                    expected: Self::MONO_CT as u8,
97                    received: _pseudos.unwrap().len() as u8,
98                });
99            }
100        }
101        let pseudos: Array1<f32> = match _pseudos {
102            Some(p) => Array::from_vec(p.iter().cloned().collect()),
103            None => Array::from_vec(vec![DEF_PSEUDO; Self::MONO_CT]),
104        };
105
106        if seqs.is_empty() {
107            return Err(Error::EmptyMotif);
108        }
109
110        let seqlen = seqs[0].len();
111        let mut counts = Array2::zeros((seqlen, Self::MONO_CT));
112        for mut row in counts.axis_iter_mut(Axis(0)) {
113            row += &pseudos;
114        }
115
116        for seq in seqs {
117            if seq.len() != seqlen {
118                return Err(Error::InconsistentLen);
119            }
120
121            for (mut ctr, base) in counts.axis_iter_mut(Axis(0)).zip(seq.iter()) {
122                ctr += &Self::incr(*base)?;
123            }
124        }
125        Ok(counts)
126    }
127
128    /// Returns the index of given monomer in the scores matrix using the lookup table `LK`
129    /// # Arguments
130    /// * `mono` - monomer, eg, b'A' for DNA or b'R' for protein
131    /// # Errors
132    /// * `Error::InvalidMonomer(mono)` - `mono` wasn't found in the lookup table
133    fn lookup(mono: u8) -> Result<usize> {
134        if mono >= 127 {
135            Err(Error::InvalidMonomer { mono })
136        } else {
137            let idx = Self::LK[mono as usize];
138            if idx == INVALID_MONO {
139                Err(Error::InvalidMonomer { mono })
140            } else {
141                Ok(idx as usize)
142            }
143        }
144    }
145
146    /// Returns an array of length MONO_CT summing to 1.  used to build a PSSM
147    /// from sequences, potentially including ambiguous monomers (eg, M is either A or C)
148    /// # Arguments
149    /// * `mono` - monomer, eg, b'A' for DNA or b'R' for protein
150    /// # Errors
151    /// * `Error::InvalidMonomer(mono)` - `mono` wasn't found in the lookup table
152    fn incr(mono: u8) -> Result<Array1<f32>>;
153
154    /// Returns the monomer associated with the given index; the reverse of `lookup`.
155    /// Returns INVALID_MONO if the index isn't associated with a monomer.
156    /// # Arguments
157    /// * `idx` - the index in question
158    fn rev_lk(idx: usize) -> u8;
159
160    /// Returns the length of motif
161    fn len(&self) -> usize;
162
163    fn is_empty(&self) -> bool {
164        self.len() == 0usize
165    }
166
167    /// Returns a representation of the motif using ambiguous codes.
168    /// Primarily useful for DNA motifs, where ambiguous codes are
169    /// common (eg, 'M' for 'A or C'); less so for proteins, where we
170    /// represent any position without a dominant amino acid as an 'X'
171    fn degenerate_consensus(&self) -> Vec<u8>;
172
173    /// Accessor - returns scores matrix
174    fn get_scores(&self) -> &Array2<f32>;
175
176    /// Return sum of "worst" base at each position
177    fn get_min_score(&self) -> f32;
178
179    /// Return sum of "best" base at each position
180    fn get_max_score(&self) -> f32;
181
182    /// Returns information content of a single position.
183    /// Used `info_content` method.
184    /// FIXME: this should be replaced with a CTFE ... or maybe just a constant
185    fn get_bits() -> f32;
186
187    /// Returns the un-normalized sum of matching bases, useful for comparing matches from
188    /// motifs of different lengths
189    ///
190    /// # Arguments
191    /// * `seq_it` - iterator representing the query sequence
192    ///
193    /// # Errors
194    /// * `Error::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
195    fn raw_score<C, T>(&self, seq_it: T) -> Result<(usize, f32, Vec<f32>)>
196    where
197        C: Borrow<u8>,
198        T: IntoIterator<Item = C>,
199    {
200        let pssm_len = self.len();
201
202        let mut best_start = 0;
203        let mut best_score = -1.0;
204        let mut best_m = Vec::new();
205        // we have to look at slices, so a simple iterator won't do
206        let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
207        let scores = self.get_scores();
208        for start in 0..=seq.len() - pssm_len {
209            let m: Vec<f32> = match (0..pssm_len)
210                .map(|i| match Self::lookup(seq[start + i]) {
211                    Err(e) => Err(e),
212                    Ok(pos) => Ok(scores[[i, pos]]),
213                })
214                .collect()
215            {
216                Ok(m) => m,
217                Err(e) => return Err(e),
218            };
219            let tot = m.iter().sum();
220            if tot > best_score {
221                best_score = tot;
222                best_start = start;
223                best_m = m;
224            }
225        }
226        Ok((best_start, best_score, best_m))
227    }
228
229    /// Returns a `ScoredPos` struct representing the best match within the query sequence
230    /// see:
231    ///   MATCHTM: a tool for searching transcription factor binding sites in DNA sequences
232    ///   Nucleic Acids Res. 2003 Jul 1; 31(13): 3576–3579
233    ///   https://www.ncbi.nlm.nih.gov/pmc/articles/PMC169193/
234    ///
235    /// # Arguments
236    /// * `seq_it` - iterator representing the query sequence
237    ///
238    /// # Errors
239    /// * `Error::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
240    /// * `Error::QueryTooShort` - sequence `seq_id` was too short
241    ///
242    /// # Example
243    /// let pssm = DNAMotif::from_seqs(vec![
244    ///            b"AAAA".to_vec(),
245    ///            b"AATA".to_vec(),
246    ///            b"AAGA".to_vec(),
247    ///            b"AAAA".to_vec(),
248    ///        ].as_ref(), None).unwrap();
249    /// let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
250    fn score<C, T>(&self, seq_it: T) -> Result<ScoredPos>
251    where
252        C: Borrow<u8>,
253        T: IntoIterator<Item = C>,
254    {
255        let pssm_len = self.len();
256        let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
257        if seq.len() < pssm_len {
258            return Err(Error::QueryTooShort {
259                motif_len: pssm_len,
260                query_len: seq.len(),
261            });
262        }
263        let min_score = self.get_min_score();
264        let max_score = self.get_max_score();
265
266        if abs_diff_eq!(max_score, min_score) {
267            return Err(Error::NullMotif);
268        }
269
270        let (best_start, best_score, best_m) = self.raw_score(&seq)?;
271
272        Ok(ScoredPos {
273            loc: best_start,
274            sum: (best_score - min_score) / (max_score - min_score),
275            scores: best_m,
276        })
277    }
278
279    /// Returns a float representing the information content of a motif; roughly the
280    /// inverse of Shannon Entropy.
281    /// Adapted from the information content described here:
282    ///    https://en.wikipedia.org/wiki/Sequence_logo#Logo_creation
283    fn info_content(&self) -> f32 {
284        fn ent<'a, I>(probs: I) -> f32
285        where
286            I: Iterator<Item = &'a f32>,
287        {
288            probs
289                .map(|p| {
290                    if *p == 0.0 {
291                        0.0
292                    } else {
293                        -1.0 * *p * p.log(2.0)
294                    }
295                })
296                .sum()
297        }
298        let bits = Self::get_bits();
299        let scores = self.get_scores();
300        let mut tot = 0.0;
301        for row in scores.rows() {
302            tot += bits - ent(row.iter());
303        }
304        tot
305    }
306}