texcraft_stdext/algorithms/
substringsearch.rs

1//! Knuth–Morris–Pratt substring search algorithm
2//!
3//! This module contains an optimal in time algorithm for finding a substring
4//! in a string. By 'string' and 'substring' we mean a vector of elements of the same type.
5//! The algorithm is due to
6//! [Knuth, Morris and Pratt](https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm).
7//!
8//! The API for this module is based on two assumptions:
9//!
10//! - There may be multiple searches for the same substring in different strings.
11//!
12//! - The elements of the string may be generated on demand as the search progresses.
13//!    That is, the full string is not necessarily known at the start.
14//!
15//! ## Example
16//!
17//! To use the algorithm, a [Matcher] is first created.
18//! This factory takes ownership of the substring.
19//! On initialization the matcher computes a number of internal
20//! quantities which make the subsequent matching fast.
21//! These quantities depend on the substring, so mutating the substring
22//! after it has been passed to the matcher is statically prevented.
23//!
24//! To match a string, a new [Search] instance is created by calling [Matcher::start]. Elements
25//! of the string are passed in one at a time to the `next` method
26//! of the search.
27//! If the substring has length `m` and matches the last `m` elements that
28//! have been passed in, the `next` method returns `true`.
29//! Otherwise it returns `false`.
30//! A single search may be used to find multiple, possibly overlapping, instances of
31//! the substring in the same string.
32//!
33//! ```
34//! # use texcraft_stdext::algorithms::substringsearch::Matcher;
35//! # use texcraft_stdext::collections::nevec::Nevec;
36//! # use texcraft_stdext::nevec;
37//!
38//! let substring = nevec![2, 3, 2];
39//! let matcher = Matcher::new(substring);
40//! let mut search = matcher.start();
41//! assert_eq![search.next(&1), false];
42//! assert_eq![search.next(&2), false];
43//! assert_eq![search.next(&3), false];
44//! assert_eq![search.next(&2), true];
45//! assert_eq![search.next(&3), false];
46//! assert_eq![search.next(&2), true];
47//! ```
48//!
49use crate::collections::nevec::Nevec;
50
51/// Data structure used to match a specific substring in many strings.
52#[derive(Debug, Clone)]
53#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
54pub struct Matcher<T: PartialEq> {
55    substring: Nevec<T>,
56    prefix_fn: Nevec<usize>,
57}
58
59impl<T: PartialEq> Matcher<T> {
60    /// Create a new matcher that searches for the provide substring.
61    pub fn new(substring: Nevec<T>) -> Matcher<T> {
62        let mut prefix_fn = Nevec::with_capacity(0, substring.len());
63        let mut k = 0;
64        for i in 1..substring.len() {
65            while k > 0 && substring[k] != substring[i] {
66                k = prefix_fn[k - 1];
67            }
68            if substring[k] == substring[i] {
69                k += 1;
70            }
71            prefix_fn.push(k);
72        }
73
74        Matcher {
75            substring,
76            prefix_fn,
77        }
78    }
79
80    /// Start a new substring search.
81    pub fn start(&self) -> Search<T> {
82        Search {
83            factory: self,
84            q: 0,
85        }
86    }
87
88    /// Get an immutable reference to the underlying substring.
89    //
90    // Obtaining a mutable reference is not supported as internal details of
91    // the matcher factory rely on the substring remaining constant.
92    pub fn substring(&self) -> &Nevec<T> {
93        &self.substring
94    }
95
96    /// Retake ownership of the underlying substring.
97    pub fn take_substring(self) -> Nevec<T> {
98        self.substring
99    }
100}
101
102/// Data structure used to search for specific substring within a specific string.
103pub struct Search<'a, T: PartialEq> {
104    factory: &'a Matcher<T>,
105    q: usize,
106}
107
108impl<'a, T: PartialEq> Search<'a, T> {
109    /// Provide the next element of the string to the matcher.
110    /// This returns true if the last `m` elements of the string match the substring, where
111    /// `m` is the length of the substring.
112    pub fn next(&mut self, tail: &T) -> bool {
113        while self.q > 0 && &self.factory.substring[self.q] != tail {
114            self.q = self.factory.prefix_fn[self.q - 1];
115        }
116        if &self.factory.substring[self.q] == tail {
117            self.q += 1;
118        }
119        if self.q == self.factory.substring.len() {
120            self.q = self.factory.prefix_fn[self.q - 1];
121            return true;
122        }
123        false
124    }
125}