bio/pattern_matching/pssm/mod.rs
1// Copyright 2018 Kieran Hervold
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Create a weight matrix representing a set of aligned reference sequences
7//! that constitute a motif, and use this matrix to scan query sequences for
8//! occurrences of this motif.
9//! Complexity: O(n*m) for motif length n and query length m
10//!
11//! The position-specific scoring matrix (PSSM), aka position weight matrix (PWM),
12//! algorithm is implemented for both DNA and amino-acid sequences.
13//!
14//! # Examples
15//!
16//! use bio::pattern_matching::pssm::DNAMotif;
17//! let pssm = DNAMotif::from_seqs(vec![
18//! b"AAAA".to_vec(),
19//! b"AATA".to_vec(),
20//! b"AAGA".to_vec(),
21//! b"AAAA".to_vec(),
22//! ].as_ref(), None).unwrap();
23//! let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
24//! println!("motif found at position {}", start_pos);
25//!
26//! /* amino acid sequences are supported, too */
27//! use pssm::pattern_matching::pssm::ProtMotif;
28//! let pssm = ProtMotif::from_seqs(vec![
29//! b"ARNNYM".to_vec(),
30//! b"ARNRYM".to_vec(),
31//! b"ARNNCM".to_vec(),
32//! b"ARNNYM".to_vec(),
33//! ].as_ref(), None).unwrap();
34
35use std::borrow::Borrow;
36
37use itertools::Itertools;
38use ndarray::prelude::*;
39
40mod dnamotif;
41pub mod errors;
42mod protmotif;
43
44pub use self::dnamotif::DNAMotif;
45pub use self::errors::{Error, Result};
46pub use self::protmotif::ProtMotif;
47
48/// default pseudocount - used to prevent 0 tallies
49pub const DEF_PSEUDO: f32 = 0.5;
50/// approximately zero
51pub const EPSILON: f32 = 1e-5;
52/// value representing an invalid monomer in lookup table
53pub const INVALID_MONO: u8 = 255;
54
55/// Represents motif score & location of match
56#[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
57pub struct ScoredPos {
58 pub loc: usize,
59 pub sum: f32,
60 pub scores: Vec<f32>,
61}
62
63impl Default for ScoredPos {
64 fn default() -> ScoredPos {
65 ScoredPos {
66 loc: 0,
67 sum: f32::NEG_INFINITY,
68 scores: Vec::new(),
69 }
70 }
71}
72
73/// Trait containing code shared between DNA and protein implementations
74/// of the position-specific scoring matrix.
75pub trait Motif {
76 /// Lookup table mapping monomer -> index
77 const LK: [u8; 127] = [INVALID_MONO; 127];
78 /// All monomers, in order corresponding to lookup table
79 const MONOS: &'static [u8] = b"";
80 /// Monomer count - equal to length of `MONOS`
81 const MONO_CT: usize = 0;
82
83 /// Returns a weight matrix representing the sequences provided.
84 /// This code is shared by implementations of `from_seqs`
85 /// # Arguments
86 /// * `seqs` - sequences incorporated into motif
87 /// * `pseudos` - array slice with a pseudocount for each monomer;
88 /// defaults to DEF_PSEUDO for all if None is supplied
89 ///
90 /// FIXME: pseudos should be an array of size MONO_CT, but that
91 /// is currently unsupported in stable rust
92 fn seqs_to_weights(seqs: &[Vec<u8>], _pseudos: Option<&[f32]>) -> Result<Array2<f32>> {
93 if _pseudos.is_some() {
94 if _pseudos.unwrap().len() != Self::MONO_CT {
95 return Err(Error::InvalidPseudos {
96 expected: Self::MONO_CT as u8,
97 received: _pseudos.unwrap().len() as u8,
98 });
99 }
100 }
101 let pseudos: Array1<f32> = match _pseudos {
102 Some(p) => Array::from_vec(p.iter().cloned().collect()),
103 None => Array::from_vec(vec![DEF_PSEUDO; Self::MONO_CT]),
104 };
105
106 if seqs.is_empty() {
107 return Err(Error::EmptyMotif);
108 }
109
110 let seqlen = seqs[0].len();
111 let mut counts = Array2::zeros((seqlen, Self::MONO_CT));
112 for mut row in counts.axis_iter_mut(Axis(0)) {
113 row += &pseudos;
114 }
115
116 for seq in seqs {
117 if seq.len() != seqlen {
118 return Err(Error::InconsistentLen);
119 }
120
121 for (mut ctr, base) in counts.axis_iter_mut(Axis(0)).zip(seq.iter()) {
122 ctr += &Self::incr(*base)?;
123 }
124 }
125 Ok(counts)
126 }
127
128 /// Returns the index of given monomer in the scores matrix using the lookup table `LK`
129 /// # Arguments
130 /// * `mono` - monomer, eg, b'A' for DNA or b'R' for protein
131 /// # Errors
132 /// * `Error::InvalidMonomer(mono)` - `mono` wasn't found in the lookup table
133 fn lookup(mono: u8) -> Result<usize> {
134 if mono >= 127 {
135 Err(Error::InvalidMonomer { mono })
136 } else {
137 let idx = Self::LK[mono as usize];
138 if idx == INVALID_MONO {
139 Err(Error::InvalidMonomer { mono })
140 } else {
141 Ok(idx as usize)
142 }
143 }
144 }
145
146 /// Returns an array of length MONO_CT summing to 1. used to build a PSSM
147 /// from sequences, potentially including ambiguous monomers (eg, M is either A or C)
148 /// # Arguments
149 /// * `mono` - monomer, eg, b'A' for DNA or b'R' for protein
150 /// # Errors
151 /// * `Error::InvalidMonomer(mono)` - `mono` wasn't found in the lookup table
152 fn incr(mono: u8) -> Result<Array1<f32>>;
153
154 /// Returns the monomer associated with the given index; the reverse of `lookup`.
155 /// Returns INVALID_MONO if the index isn't associated with a monomer.
156 /// # Arguments
157 /// * `idx` - the index in question
158 fn rev_lk(idx: usize) -> u8;
159
160 /// Returns the length of motif
161 fn len(&self) -> usize;
162
163 fn is_empty(&self) -> bool {
164 self.len() == 0usize
165 }
166
167 /// Returns a representation of the motif using ambiguous codes.
168 /// Primarily useful for DNA motifs, where ambiguous codes are
169 /// common (eg, 'M' for 'A or C'); less so for proteins, where we
170 /// represent any position without a dominant amino acid as an 'X'
171 fn degenerate_consensus(&self) -> Vec<u8>;
172
173 /// Accessor - returns scores matrix
174 fn get_scores(&self) -> &Array2<f32>;
175
176 /// Return sum of "worst" base at each position
177 fn get_min_score(&self) -> f32;
178
179 /// Return sum of "best" base at each position
180 fn get_max_score(&self) -> f32;
181
182 /// Returns information content of a single position.
183 /// Used `info_content` method.
184 /// FIXME: this should be replaced with a CTFE ... or maybe just a constant
185 fn get_bits() -> f32;
186
187 /// Returns the un-normalized sum of matching bases, useful for comparing matches from
188 /// motifs of different lengths
189 ///
190 /// # Arguments
191 /// * `seq_it` - iterator representing the query sequence
192 ///
193 /// # Errors
194 /// * `Error::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
195 fn raw_score<C, T>(&self, seq_it: T) -> Result<(usize, f32, Vec<f32>)>
196 where
197 C: Borrow<u8>,
198 T: IntoIterator<Item = C>,
199 {
200 let pssm_len = self.len();
201
202 let mut best_start = 0;
203 let mut best_score = -1.0;
204 let mut best_m = Vec::new();
205 // we have to look at slices, so a simple iterator won't do
206 let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
207 let scores = self.get_scores();
208 for start in 0..=seq.len() - pssm_len {
209 let m: Vec<f32> = match (0..pssm_len)
210 .map(|i| match Self::lookup(seq[start + i]) {
211 Err(e) => Err(e),
212 Ok(pos) => Ok(scores[[i, pos]]),
213 })
214 .collect()
215 {
216 Ok(m) => m,
217 Err(e) => return Err(e),
218 };
219 let tot = m.iter().sum();
220 if tot > best_score {
221 best_score = tot;
222 best_start = start;
223 best_m = m;
224 }
225 }
226 Ok((best_start, best_score, best_m))
227 }
228
229 /// Returns a `ScoredPos` struct representing the best match within the query sequence
230 /// see:
231 /// MATCHTM: a tool for searching transcription factor binding sites in DNA sequences
232 /// Nucleic Acids Res. 2003 Jul 1; 31(13): 3576–3579
233 /// https://www.ncbi.nlm.nih.gov/pmc/articles/PMC169193/
234 ///
235 /// # Arguments
236 /// * `seq_it` - iterator representing the query sequence
237 ///
238 /// # Errors
239 /// * `Error::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
240 /// * `Error::QueryTooShort` - sequence `seq_id` was too short
241 ///
242 /// # Example
243 /// let pssm = DNAMotif::from_seqs(vec![
244 /// b"AAAA".to_vec(),
245 /// b"AATA".to_vec(),
246 /// b"AAGA".to_vec(),
247 /// b"AAAA".to_vec(),
248 /// ].as_ref(), None).unwrap();
249 /// let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
250 fn score<C, T>(&self, seq_it: T) -> Result<ScoredPos>
251 where
252 C: Borrow<u8>,
253 T: IntoIterator<Item = C>,
254 {
255 let pssm_len = self.len();
256 let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
257 if seq.len() < pssm_len {
258 return Err(Error::QueryTooShort {
259 motif_len: pssm_len,
260 query_len: seq.len(),
261 });
262 }
263 let min_score = self.get_min_score();
264 let max_score = self.get_max_score();
265
266 if abs_diff_eq!(max_score, min_score) {
267 return Err(Error::NullMotif);
268 }
269
270 let (best_start, best_score, best_m) = self.raw_score(&seq)?;
271
272 Ok(ScoredPos {
273 loc: best_start,
274 sum: (best_score - min_score) / (max_score - min_score),
275 scores: best_m,
276 })
277 }
278
279 /// Returns a float representing the information content of a motif; roughly the
280 /// inverse of Shannon Entropy.
281 /// Adapted from the information content described here:
282 /// https://en.wikipedia.org/wiki/Sequence_logo#Logo_creation
283 fn info_content(&self) -> f32 {
284 fn ent<'a, I>(probs: I) -> f32
285 where
286 I: Iterator<Item = &'a f32>,
287 {
288 probs
289 .map(|p| {
290 if *p == 0.0 {
291 0.0
292 } else {
293 -1.0 * *p * p.log(2.0)
294 }
295 })
296 .sum()
297 }
298 let bits = Self::get_bits();
299 let scores = self.get_scores();
300 let mut tot = 0.0;
301 for row in scores.rows() {
302 tot += bits - ent(row.iter());
303 }
304 tot
305 }
306}