1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// Copyright 2014-2016 Johannes Köster.
// Licensed under the MIT license (http://opensource.org/licenses/MIT)
// This file may not be copied, modified, or distributed
// except according to those terms.

//! Backward oracle matching algorithm.
//! Best-case complexity: O(n / m) with pattern of length m and text of length n.
//! Worst case complexity: O(n * m).
//!
//! # Example
//!
//! ```
//! use bio::pattern_matching::bom::BOM;
//! let text = b"ACGGCTAGGAAAAAGACTGAGGACTGAAAA";
//! let pattern = b"GAAAA";
//! let bom = BOM::new(pattern);
//! let occ: Vec<usize> = bom.find_all(text).collect();
//! assert_eq!(occ, [8, 25]);
//! ```

use crate::utils::TextSlice;
use std::borrow::Borrow;
use std::cmp::Ord;
use std::iter::repeat;

use vec_map::VecMap;

/// Backward oracle matching algorithm.
pub struct BOM {
    m: usize,
    table: Vec<VecMap<usize>>,
}

impl BOM {
    /// Create a new instance for a given pattern.
    pub fn new<C, P>(pattern: P) -> Self
    where
        C: Borrow<u8> + Ord,
        P: IntoIterator<Item = C>,
        P::IntoIter: DoubleEndedIterator + ExactSizeIterator + Clone,
    {
        let pattern = pattern.into_iter();
        let m = pattern.len();
        let maxsym = *pattern
            .clone()
            .max()
            .expect("Expecting non-empty pattern.")
            .borrow() as usize;
        let mut table: Vec<VecMap<usize>> = Vec::with_capacity(m);
        // init suffix table, initially all values unknown
        // suff[i] is the state in which the longest suffix of
        // pattern[..i+1] ends that does not end in i
        let mut suff: Vec<Option<usize>> = repeat(None).take(m + 1).collect();

        for (j, b) in pattern.rev().enumerate() {
            let i = j + 1;
            let a = *b.borrow() as usize;
            let mut delta = VecMap::with_capacity(maxsym);
            // reading symbol a leads into state i (this is an inner edge)
            delta.insert(a, i);
            // now, add edges for substrings ending with a
            let mut k = suff[i - 1];

            // for this iterate over the known suffixes until
            // reaching an edge labelled with a or the start
            while let Some(k_) = k {
                if table[k_].contains_key(a) {
                    break;
                }
                table[k_].insert(a, i);
                k = suff[k_];
            }

            // the longest suffix is either 0 or the state
            // reached by the edge labelled with a
            suff[i] = Some(match k {
                Some(k) => *table[k].get(a).unwrap(),
                None => 0,
            });

            table.push(delta);
        }

        BOM { m, table }
    }

    fn delta(&self, q: usize, a: u8) -> Option<usize> {
        if q >= self.table.len() {
            None
        } else {
            match self.table[q].get(a as usize) {
                Some(&q) => Some(q),
                None => None,
            }
        }
    }

    /// Find all matches of the pattern in the given text. Matches are returned as an iterator over start positions.
    pub fn find_all<'a>(&'a self, text: TextSlice<'a>) -> Matches<'_> {
        Matches {
            bom: self,
            text,
            window: self.m,
        }
    }
}

/// Iterator over start positions of matches.
pub struct Matches<'a> {
    bom: &'a BOM,
    text: TextSlice<'a>,
    window: usize,
}

impl<'a> Iterator for Matches<'a> {
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        while self.window <= self.text.len() {
            let (mut q, mut j) = (Some(0), 1);
            while j <= self.bom.m {
                match q {
                    Some(q_) => {
                        q = self.bom.delta(q_, self.text[self.window - j]);
                        j += 1;
                    }
                    None => break,
                }
            }
            // putative start position
            let i = self.window - self.bom.m;
            self.window += self.bom.m + 2 - j;
            if q.is_some() {
                // return match
                return Some(i);
            }
        }
        None
    }
}

#[cfg(test)]
mod tests {
    use super::BOM;
    use itertools::Itertools;

    #[test]
    fn test_delta() {
        let pattern = b"qnnnannan"; // reverse of nannannnq
        let bom = BOM::new(pattern);
        assert_eq!(bom.delta(0, b'n'), Some(1));
        assert_eq!(bom.delta(1, b'a'), Some(2));
        assert_eq!(bom.delta(2, b'n'), Some(3));
        assert_eq!(bom.delta(3, b'n'), Some(4));
        assert_eq!(bom.delta(4, b'a'), Some(5));
        assert_eq!(bom.delta(5, b'n'), Some(6));
        assert_eq!(bom.delta(6, b'n'), Some(7));
        assert_eq!(bom.delta(7, b'n'), Some(8));
        assert_eq!(bom.delta(8, b'q'), Some(9));

        assert_eq!(bom.delta(0, b'a'), Some(2));
        assert_eq!(bom.delta(0, b'q'), Some(9));
        assert_eq!(bom.delta(1, b'n'), Some(4));
        assert_eq!(bom.delta(1, b'q'), Some(9));
        assert_eq!(bom.delta(4, b'n'), Some(8));
        assert_eq!(bom.delta(4, b'q'), Some(9));
        bom.delta(9, b'a');
    }

    #[test]
    fn test_find_all() {
        let text = b"dhjalkjwqnnnannanaflkjdklfj";
        let pattern = b"qnnnannan";
        let bom = BOM::new(pattern);
        assert_eq!(bom.find_all(text).collect_vec(), [8]);
    }
}