bndm/lib.rs
1// Copyright (C) 2019 - 2024 Wilfred Bos
2// Licensed under the MIT license. See the LICENSE file for the terms and conditions.
3
4//! # Backward Nondeterministic Dawg Matching (BNDM)
5//!
6//! BNDM is an optimized string search algorithm designed for efficiently locating
7//! patterns within a text. The BNDM algorithm was invented by Gonzalo Navarro and Mathieu
8//! Raffinot.
9//!
10//! The BNDM algorithm works by preprocessing the pattern to generate a set of bitmasks.
11//! These bitmasks are then used to efficiently scan the text for occurrences of the
12//! pattern.
13//!
14//! Unlike the traditional BNDM algorithm, this implementation is optimized by scanning the
15//! first 2 bytes outside the inner loop. Additionally, it does not have the limitation that
16//! the pattern size is limited to the word size of the CPU. It's possible to search for
17//! larger patterns than the word size and still benefit from the performance that BNDM offers.
18//!
19//! One of the key features of this implementation is its support for wildcard search.
20//! This algorithm is ideally suited for this, as it does not impact the performance of the
21//! pattern matching itself. The wildcard search only slightly affects the indexing, ensuring
22//! that the overall efficiency of the pattern matching remains high.
23//!
24//! ## BndmConfig
25//!
26//! The `BndmConfig` struct is used to store the preprocessed pattern and the bitmasks.
27//!
28//! ## Functions
29//!
30//! The main function provided by this module is `find_pattern()`, which searches for the
31//! pattern in a given text. It returns the index of the first occurrence of the pattern
32//! in the text, or `None` if the pattern is not found.
33//!
34//! ## Usage
35//!
36//! Here is an example of how to use this module to search for a pattern in a text:
37//!
38//! ```rust
39//! use bndm::{BndmConfig, find_pattern};
40//!
41//! let source = b"The quick brown fox jumps over the lazy dog";
42//! let pattern = b"jumps";
43//! let config = BndmConfig::new(pattern, None);
44//! let index = find_pattern(source, &config);
45//! assert_eq!(index, Some(20));
46//! ```
47//!
48//! With wildcard:
49//!
50//! ```rust
51//! use bndm::{BndmConfig, find_pattern};
52//!
53//! let source = b"The quick brown fox jumps over the lazy dog";
54//! let pattern = b"ju??s";
55//! let config = BndmConfig::new(pattern, Some(b'?'));
56//! let index = find_pattern(source, &config);
57//! assert_eq!(index, Some(20));
58//! ```
59
60use std::cmp::min;
61
62const MASKS_TABLE_SIZE: usize = 256;
63const WORD_SIZE_IN_BITS: usize = usize::BITS as usize;
64
65/// The `BndmConfig` struct is used to store the pattern and the bitmasks.
66pub struct BndmConfig {
67 /// An array of bitmasks, one for each possible byte value.
68 pub masks: [usize; MASKS_TABLE_SIZE],
69
70 /// An optional wildcard character. If provided, this character in the pattern
71 /// can match any character in the text.
72 pub wildcard: Option<u8>,
73
74 /// The pattern to search for in the text.
75 pub pattern: Vec<u8>
76}
77
78impl BndmConfig {
79 /// Creates a new `BndmConfig` instance.
80 ///
81 /// # Arguments
82 ///
83 /// * `search_pattern` - The pattern to search for in the text.
84 /// * `wildcard` - An optional wildcard character. If provided, this character in the pattern
85 /// can match any character in the text.
86 ///
87 /// # Returns
88 ///
89 /// * `BndmConfig` - A new `BndmConfig` instance.
90 ///
91 /// # Usage
92 ///
93 /// Without wildcard:
94 ///
95 /// ```rust
96 /// use bndm::BndmConfig;
97 ///
98 /// let pattern = b"jumps";
99 /// let wildcard = None;
100 /// let config = BndmConfig::new(pattern, wildcard);
101 /// ```
102 ///
103 /// With wildcard:
104 ///
105 /// ```rust
106 /// use bndm::BndmConfig;
107 ///
108 /// let pattern = b"ju??s";
109 /// let wildcard = b'?';
110 /// let config = BndmConfig::new(pattern, Some(wildcard));
111 /// ```
112 pub fn new(search_pattern: &[u8], wildcard: Option<u8>) -> BndmConfig {
113 let len = get_pattern_length_within_cpu_word(search_pattern.len());
114
115 BndmConfig {
116 masks: generate_masks(&search_pattern[..len], wildcard),
117 wildcard,
118 pattern: search_pattern.to_owned()
119 }
120 }
121}
122
123/// Searches for the pattern in the source string using the BNDM algorithm.
124///
125/// The function takes a source string and a `BndmConfig` as input. The `BndmConfig`
126/// contains the preprocessed pattern and the bitmasks. The function returns the index
127/// of the first occurrence of the pattern in the text, or `None` if the pattern is not
128/// found.
129///
130/// # Arguments
131///
132/// * `source` - The source string to search for the pattern.
133/// * `config` - The configuration for the BNDM search, which includes the pattern and the
134/// bitmasks.
135///
136/// # Returns
137///
138/// * `Option<usize>` - Returns the index of the first occurrence of the pattern in the text,
139/// or `None` if the pattern is not found.
140///
141/// # Usage
142///
143/// Without wildcard:
144///
145/// ```rust
146/// use bndm::{BndmConfig, find_pattern};
147///
148/// let source = b"The quick brown fox jumps over the lazy dog";
149/// let pattern = b"jumps";
150/// let config = BndmConfig::new(pattern, None);
151/// let index = find_pattern(source, &config);
152/// assert_eq!(index, Some(20));
153/// ```
154///
155/// With wildcard:
156///
157/// ```rust
158/// use bndm::{BndmConfig, find_pattern};
159///
160/// let source = b"The quick brown fox jumps over the lazy dog";
161/// let pattern = b"ju??s";
162/// let config = BndmConfig::new(pattern, Some(b'?'));
163/// let index = find_pattern(source, &config);
164/// assert_eq!(index, Some(20));
165/// ```
166pub fn find_pattern(source: &[u8], config: &BndmConfig) -> Option<usize> {
167 match config.pattern.len() {
168 0 => None,
169 1 => config.wildcard
170 .map_or(false, |w| w == config.pattern[0]).then_some(0)
171 .or_else(|| source.iter().position(|&s| s == config.pattern[0])),
172 _ => find_pattern_bndm(source, config)
173 }
174}
175
176fn find_pattern_bndm(source: &[u8], config: &BndmConfig) -> Option<usize> {
177 if config.pattern.len() > source.len() {
178 return None;
179 }
180
181 let len = get_pattern_length_within_cpu_word(config.pattern.len()) - 1;
182 let end = source.len() - config.pattern.len();
183 let df = 1 << len;
184 let mut i = 0;
185
186 while i <= end {
187 let mut j = len;
188 let mut last = len;
189
190 let mut d = get_mask(source, config, i + j);
191 d = (d << 1) & get_mask(source, config, i + j - 1);
192 while d != 0 {
193 j -= 1;
194 if d & df != 0 {
195 if j == 0 {
196 if find_remaining(source, config, i + WORD_SIZE_IN_BITS) {
197 return Some(i);
198 }
199 j += 1;
200 }
201 last = j;
202 }
203 d = (d << 1) & get_mask(source, config, i + j - 1);
204 }
205
206 i += last;
207 }
208 None
209}
210
211fn get_mask(source: &[u8], config: &BndmConfig, index: usize) -> usize {
212 unsafe {
213 *config.masks.get_unchecked(*source.get_unchecked(index) as usize)
214 }
215}
216
217/// Checks if the remaining part of the pattern matches the source string.
218///
219/// This function is used when the pattern is longer than the CPU word size.
220/// It checks the remaining part of the pattern (after the first CPU word size characters)
221/// against the corresponding part of the source string.
222///
223/// # Arguments
224///
225/// * `source` - The source string to search for the pattern.
226/// * `config` - The configuration for the BNDM search, which includes the pattern and the
227/// wildcard character.
228/// * `start_index` - The index in the source string from where the remaining part of the
229/// pattern should be checked.
230///
231/// # Returns
232///
233/// * `bool` - Returns `true` if the remaining part of the pattern matches the corresponding part of the source string, `false` otherwise.
234fn find_remaining(source: &[u8], config: &BndmConfig, start_index: usize) -> bool {
235 config.pattern.iter().skip(WORD_SIZE_IN_BITS).enumerate().all(|(index, &pattern_byte)| unsafe {
236 *source.get_unchecked(start_index + index) == pattern_byte || config.wildcard.map_or(false, |w| pattern_byte == w)
237 })
238}
239
240fn get_pattern_length_within_cpu_word(search_pattern_length: usize) -> usize {
241 min(search_pattern_length, WORD_SIZE_IN_BITS)
242}
243
244fn calculate_wildcard_mask(search_pattern: &[u8], wildcard: Option<u8>) -> usize {
245 wildcard.map_or(0, |wildcard| search_pattern.iter()
246 .fold(0, |mask, &pattern_byte| (mask << 1) | (pattern_byte == wildcard) as usize))
247}
248
249fn generate_masks(search_pattern: &[u8], wildcard: Option<u8>) -> [usize; MASKS_TABLE_SIZE] {
250 let default_mask = calculate_wildcard_mask(search_pattern, wildcard);
251 let mut masks = [default_mask; MASKS_TABLE_SIZE];
252
253 search_pattern.iter().rev().enumerate()
254 .for_each(|(i, &pattern_byte)| masks[pattern_byte as usize] |= 1 << i);
255
256 masks
257}
258
259#[cfg(test)]
260#[path = "./bndm_test.rs"]
261mod bndm_test;