1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
// Copyright 2018 Kieran Hervold
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
// This file may not be copied, modified, or distributed
// except according to those terms.

//! Create a weight matrix representing a set of aligned reference sequences
//! that constitute a motif, and use this matrix to scan query sequences for
//! occurrences of this motif.
//! Complexity: O(n*m) for motif length n and query length m
//!
//! The position-specific scoring matrix (PSSM), aka position weight matrix (PWM),
//! algorithm is implemented for both DNA and amino-acid sequences.
//!
//! # Examples
//!
//! use bio::pattern_matching::pssm::DNAMotif;
//! let pssm = DNAMotif::from_seqs(vec![
//!            b"AAAA".to_vec(),
//!            b"AATA".to_vec(),
//!            b"AAGA".to_vec(),
//!            b"AAAA".to_vec(),
//!        ].as_ref(), None).unwrap();
//! let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
//! println!("motif found at position {}", start_pos);
//!
//! /* amino acid sequences are supported, too */
//! use pssm::pattern_matching::pssm::ProtMotif;
//! let pssm = ProtMotif::from_seqs(vec![
//!            b"ARNNYM".to_vec(),
//!            b"ARNRYM".to_vec(),
//!            b"ARNNCM".to_vec(),
//!            b"ARNNYM".to_vec(),
//!        ].as_ref(), None).unwrap();

use std::borrow::Borrow;
use std::char;
use std::f32::NEG_INFINITY;

use itertools::Itertools;
use ndarray::prelude::Array2;

mod dnamotif;
mod protmotif;

pub use self::dnamotif::DNAMotif;
pub use self::protmotif::ProtMotif;

/// default pseudocount - used to prevent 0 tallies
pub const DEF_PSEUDO: f32 = 0.5;
/// approximately zero
pub const EPSILON: f32 = 1e-5;
/// value representing an invalid monomer in lookup table
pub const INVALID_MONO: u8 = 255;

/// Errors
quick_error! {
    #[derive(Debug, PartialEq)]
    pub enum PSSMError {
        QueryTooShort(motif_len: usize, query_len: usize) {
            description("query cannot be shorter than motif")
            display("query length {} is shorter than motif length {}", query_len, motif_len)
        }
        InconsistentLen {
            description("attempted to build a motif from sequences with mismatched lengths")
            display("mismatched sequence lengths")
        }
        InvalidMonomer(mono: u8) {
            description("unknown monomer, eg, DNA base other than ATGC")
            display("monomer '{}' is invalid", char::from(*mono))
        }
        NullMotif {
            description("a motif in which every monomer is equally likely at every position will result in a divide-by-zero exception")
            display("information-free motif")
        }
        EmptyMotif {
            description("attempted to create a motif from zero sequences")
            display("motif cannot be created from zero sequences")
        }
        InvalidPseudos(expected: u8, received: u8) {
            description("pseudo-score array should have on entry per monomer")
            display("expected pseudo-score array of length {}; got {}", expected, received)
        }
    }
}

/// Represents motif score & location of match
#[derive(Debug, Clone, PartialEq)]
pub struct ScoredPos {
    pub loc: usize,
    pub sum: f32,
    pub scores: Vec<f32>,
}

impl Default for ScoredPos {
    fn default() -> ScoredPos {
        ScoredPos {
            loc: 0,
            sum: NEG_INFINITY,
            scores: Vec::new(),
        }
    }
}

/// Trait containing code shared between DNA and protein implementations
/// of the position-specific scoring matrix.
pub trait Motif {
    /// Lookup table mapping monomer -> index
    const LK: [u8; 127] = [INVALID_MONO; 127];
    /// All monomers, in order corresponding to lookup table
    const MONOS: &'static [u8] = b"";
    /// Monomer count - equal to length of `MONOS`
    const MONO_CT: usize = 0;

    /// Returns a weight matrix representing the sequences provided.
    /// This code is shared by implementations of `from_seqs`
    /// # Arguments
    /// * `seqs` - sequences incorporated into motif
    /// * `pseudos` - array slice with a pseudocount for each monomer;
    ///    defaults to DEF_PSEUDO for all if None is supplied
    ///
    /// FIXME: pseudos should be an array of size MONO_CT, but that
    /// is currently unsupported
    fn seqs_to_weights(
        seqs: &Vec<Vec<u8>>,
        _pseudos: Option<&[f32]>,
    ) -> Result<Array2<f32>, PSSMError> {
        let p1 = vec![DEF_PSEUDO; Self::MONO_CT];
        let pseudos = match _pseudos {
            Some(ref p2) => p2,
            None => p1.as_slice(),
        };

        if pseudos.len() != Self::MONO_CT {
            return Err(PSSMError::InvalidPseudos(
                Self::MONO_CT as u8,
                pseudos.len() as u8,
            ));
        }

        if seqs.is_empty() {
            return Err(PSSMError::EmptyMotif);
        }

        let seqlen = seqs[0].len();
        let mut counts = Array2::zeros((seqlen, Self::MONO_CT));
        for i in 0..seqlen {
            for base in 0..Self::MONO_CT {
                counts[[i, base]] = pseudos[base];
            }
        }

        for seq in seqs.iter() {
            if seq.len() != seqlen {
                return Err(PSSMError::InconsistentLen);
            }

            for (idx, base) in seq.iter().enumerate() {
                match Self::lookup(*base) {
                    Err(e) => return Err(e),
                    Ok(pos) => counts[[idx, pos]] += 1.0,
                }
            }
        }
        Ok(counts)
    }

    /// Returns the index of given monomer in the scores matrix using the lookup table `LK`
    /// # Arguments
    /// * `mono` - monomer, eg, b'A' for DNA or b'R' for protein
    /// # Errors
    /// * `PSSMError::InvalidMonomer(mono)` - `mono` wasn't found in the lookup table
    fn lookup(mono: u8) -> Result<usize, PSSMError> {
        if mono >= 127 {
            Err(PSSMError::InvalidMonomer(mono))
        } else {
            let idx = Self::LK[mono as usize];
            if idx == INVALID_MONO {
                Err(PSSMError::InvalidMonomer(mono))
            } else {
                Ok(idx as usize)
            }
        }
    }

    /// Returns the monomer associated with the given index; the reverse of `lookup`.
    /// Returns INVALID_MONO if the index isn't associated with a monomer.
    /// # Arguments
    /// * `idx` - the index in question
    fn rev_lk(idx: usize) -> u8;

    /// Returns the length of motif
    fn len(&self) -> usize;

    /// Returns a representation of the motif using ambiguous codes.
    /// Primarily useful for DNA motifs, where ambiguous codes are
    /// common (eg, 'M' for 'A or C'); less so for proteins, where we
    /// represent any position without a dominant amino acid as an 'X'
    fn degenerate_consensus(&self) -> Vec<u8>;

    /// Accessor - returns scores matrix
    fn get_scores(&self) -> &Array2<f32>;

    /// Return sum of "worst" base at each position
    fn get_min_score(&self) -> f32;

    /// Return sum of "best" base at each position
    fn get_max_score(&self) -> f32;

    /// Returns information content of a single position.
    /// Used `info_content` method.
    /// FIXME: this should be replaced with a CTFE ... or maybe just a constant
    fn get_bits() -> f32;

    /// Returns the un-normalized sum of matching bases, useful for comparing matches from
    /// motifs of different lengths
    ///
    /// # Arguments
    /// * `seq_it` - iterator representing the query sequence
    ///
    /// # Errors
    /// * `PSSMError::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
    fn raw_score<C, T>(&self, seq_it: T) -> Result<(usize, f32, Vec<f32>), PSSMError>
    where
        C: Borrow<u8>,
        T: IntoIterator<Item = C>,
    {
        let pssm_len = self.len();

        let mut best_start = 0;
        let mut best_score = -1.0;
        let mut best_m = Vec::new();
        // we have to look at slices, so a simple iterator won't do
        let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
        let scores = self.get_scores();
        for start in 0..=seq.len() - pssm_len {
            let m: Vec<f32> = match (0..pssm_len)
                .map(|i| match Self::lookup(seq[start + i]) {
                    Err(e) => Err(e),
                    Ok(pos) => Ok(scores[[i, pos]]),
                })
                .collect()
            {
                Ok(m) => m,
                Err(e) => return Err(e),
            };
            let tot = m.iter().sum();
            if tot > best_score {
                best_score = tot;
                best_start = start;
                best_m = m;
            }
        }
        Ok((best_start, best_score, best_m))
    }

    /// Returns a `ScoredPos` struct representing the best match within the query sequence
    /// see:
    ///   MATCHTM: a tool for searching transcription factor binding sites in DNA sequences
    ///   Nucleic Acids Res. 2003 Jul 1; 31(13): 3576–3579
    ///   https://www.ncbi.nlm.nih.gov/pmc/articles/PMC169193/
    ///
    /// # Arguments
    /// * `seq_it` - iterator representing the query sequence
    ///
    /// # Errors
    /// * `PSSMError::InvalidMonomer(mono)` - sequence `seq_it` contained invalid monomer `mono`
    /// * `PSSMError::QueryTooShort` - sequence `seq_id` was too short
    ///
    /// # Example
    /// let pssm = DNAMotif::from_seqs(vec![
    ///            b"AAAA".to_vec(),
    ///            b"AATA".to_vec(),
    ///            b"AAGA".to_vec(),
    ///            b"AAAA".to_vec(),
    ///        ].as_ref(), None).unwrap();
    /// let start_pos = pssm.score(b"CCCCCAATA").unwrap().loc;
    fn score<C, T>(&self, seq_it: T) -> Result<ScoredPos, PSSMError>
    where
        C: Borrow<u8>,
        T: IntoIterator<Item = C>,
    {
        let pssm_len = self.len();
        let seq = seq_it.into_iter().map(|c| *c.borrow()).collect_vec();
        if seq.len() < pssm_len {
            return Err(PSSMError::QueryTooShort(pssm_len, seq.len()));
        }
        let min_score = self.get_min_score();
        let max_score = self.get_max_score();

        if max_score == min_score {
            return Err(PSSMError::NullMotif);
        }

        let (best_start, best_score, best_m) = self.raw_score(&seq)?;

        Ok(ScoredPos {
            loc: best_start,
            sum: (best_score - min_score) / (max_score - min_score),
            scores: best_m,
        })
    }

    /// Returns a float representing the information content of a motif; roughly the
    /// inverse of Shannon Entropy.
    /// Adapted from the information content described here:
    ///    https://en.wikipedia.org/wiki/Sequence_logo#Logo_creation
    fn info_content(&self) -> f32 {
        fn ent<'a, I>(probs: I) -> f32
        where
            I: Iterator<Item = &'a f32>,
        {
            probs
                .map(|p| {
                    if *p == 0.0 {
                        0.0
                    } else {
                        -1.0 * *p * p.log(2.0)
                    }
                })
                .sum()
        }
        let bits = Self::get_bits();
        let scores = self.get_scores();
        let mut tot = 0.0;
        for row in scores.genrows() {
            tot += bits - ent(row.iter());
        }
        tot
    }
}