bndm/
lib.rs

1// Copyright (C) 2019 - 2024 Wilfred Bos
2// Licensed under the MIT license. See the LICENSE file for the terms and conditions.
3
4//! # Backward Nondeterministic Dawg Matching (BNDM)
5//!
6//! BNDM is an optimized string search algorithm designed for efficiently locating
7//! patterns within a text. The BNDM algorithm was invented by Gonzalo Navarro and Mathieu
8//! Raffinot.
9//!
10//! The BNDM algorithm works by preprocessing the pattern to generate a set of bitmasks.
11//! These bitmasks are then used to efficiently scan the text for occurrences of the
12//! pattern.
13//!
14//! Unlike the traditional BNDM algorithm, this implementation is optimized by scanning the
15//! first 2 bytes outside the inner loop. Additionally, it does not have the limitation that
16//! the pattern size is limited to the word size of the CPU. It's possible to search for
17//! larger patterns than the word size and still benefit from the performance that BNDM offers.
18//!
19//! One of the key features of this implementation is its support for wildcard search.
20//! This algorithm is ideally suited for this, as it does not impact the performance of the
21//! pattern matching itself. The wildcard search only slightly affects the indexing, ensuring
22//! that the overall efficiency of the pattern matching remains high.
23//!
24//! ## BndmConfig
25//!
26//! The `BndmConfig` struct is used to store the preprocessed pattern and the bitmasks.
27//!
28//! ## Functions
29//!
30//! The main function provided by this module is `find_pattern()`, which searches for the
31//! pattern in a given text. It returns the index of the first occurrence of the pattern
32//! in the text, or `None` if the pattern is not found.
33//!
34//! ## Usage
35//!
36//! Here is an example of how to use this module to search for a pattern in a text:
37//!
38//! ```rust
39//! use bndm::{BndmConfig, find_pattern};
40//!
41//! let source = b"The quick brown fox jumps over the lazy dog";
42//! let pattern = b"jumps";
43//! let config = BndmConfig::new(pattern, None);
44//! let index = find_pattern(source, &config);
45//! assert_eq!(index, Some(20));
46//! ```
47//!
48//! With wildcard:
49//!
50//! ```rust
51//! use bndm::{BndmConfig, find_pattern};
52//!
53//! let source = b"The quick brown fox jumps over the lazy dog";
54//! let pattern = b"ju??s";
55//! let config = BndmConfig::new(pattern, Some(b'?'));
56//! let index = find_pattern(source, &config);
57//! assert_eq!(index, Some(20));
58//! ```
59
60use std::cmp::min;
61
62const MASKS_TABLE_SIZE: usize = 256;
63const WORD_SIZE_IN_BITS: usize = usize::BITS as usize;
64
65/// The `BndmConfig` struct is used to store the pattern and the bitmasks.
66pub struct BndmConfig {
67    /// An array of bitmasks, one for each possible byte value.
68    pub masks: [usize; MASKS_TABLE_SIZE],
69
70    /// An optional wildcard character. If provided, this character in the pattern
71    /// can match any character in the text.
72    pub wildcard: Option<u8>,
73
74    /// The pattern to search for in the text.
75    pub pattern: Vec<u8>
76}
77
78impl BndmConfig {
79    /// Creates a new `BndmConfig` instance.
80    ///
81    /// # Arguments
82    ///
83    /// * `search_pattern` - The pattern to search for in the text.
84    /// * `wildcard` - An optional wildcard character. If provided, this character in the pattern
85    ///                can match any character in the text.
86    ///
87    /// # Returns
88    ///
89    /// * `BndmConfig` - A new `BndmConfig` instance.
90    ///
91    /// # Usage
92    ///
93    /// Without wildcard:
94    ///
95    /// ```rust
96    /// use bndm::BndmConfig;
97    ///
98    /// let pattern = b"jumps";
99    /// let wildcard = None;
100    /// let config = BndmConfig::new(pattern, wildcard);
101    /// ```
102    ///
103    /// With wildcard:
104    ///
105    /// ```rust
106    /// use bndm::BndmConfig;
107    ///
108    /// let pattern = b"ju??s";
109    /// let wildcard = b'?';
110    /// let config = BndmConfig::new(pattern, Some(wildcard));
111    /// ```
112    pub fn new(search_pattern: &[u8], wildcard: Option<u8>) -> BndmConfig {
113        let len = get_pattern_length_within_cpu_word(search_pattern.len());
114
115        BndmConfig {
116            masks: generate_masks(&search_pattern[..len], wildcard),
117            wildcard,
118            pattern: search_pattern.to_owned()
119        }
120    }
121}
122
123/// Searches for the pattern in the source string using the BNDM algorithm.
124///
125/// The function takes a source string and a `BndmConfig` as input. The `BndmConfig`
126/// contains the preprocessed pattern and the bitmasks. The function returns the index
127/// of the first occurrence of the pattern in the text, or `None` if the pattern is not
128/// found.
129///
130/// # Arguments
131///
132/// * `source` - The source string to search for the pattern.
133/// * `config` - The configuration for the BNDM search, which includes the pattern and the
134///              bitmasks.
135///
136/// # Returns
137///
138/// * `Option<usize>` - Returns the index of the first occurrence of the pattern in the text,
139///                     or `None` if the pattern is not found.
140///
141/// # Usage
142///
143/// Without wildcard:
144///
145/// ```rust
146/// use bndm::{BndmConfig, find_pattern};
147///
148/// let source = b"The quick brown fox jumps over the lazy dog";
149/// let pattern = b"jumps";
150/// let config = BndmConfig::new(pattern, None);
151/// let index = find_pattern(source, &config);
152/// assert_eq!(index, Some(20));
153/// ```
154///
155/// With wildcard:
156///
157/// ```rust
158/// use bndm::{BndmConfig, find_pattern};
159///
160/// let source = b"The quick brown fox jumps over the lazy dog";
161/// let pattern = b"ju??s";
162/// let config = BndmConfig::new(pattern, Some(b'?'));
163/// let index = find_pattern(source, &config);
164/// assert_eq!(index, Some(20));
165/// ```
166pub fn find_pattern(source: &[u8], config: &BndmConfig) -> Option<usize> {
167    match config.pattern.len() {
168        0 => None,
169        1 => config.wildcard
170            .map_or(false, |w| w == config.pattern[0]).then_some(0)
171            .or_else(|| source.iter().position(|&s| s == config.pattern[0])),
172        _ => find_pattern_bndm(source, config)
173    }
174}
175
176fn find_pattern_bndm(source: &[u8], config: &BndmConfig) -> Option<usize> {
177    if config.pattern.len() > source.len() {
178        return None;
179    }
180
181    let len = get_pattern_length_within_cpu_word(config.pattern.len()) - 1;
182    let end = source.len() - config.pattern.len();
183    let df = 1 << len;
184    let mut i = 0;
185
186    while i <= end {
187        let mut j = len;
188        let mut last = len;
189
190        let mut d = get_mask(source, config, i + j);
191        d = (d << 1) & get_mask(source, config, i + j - 1);
192        while d != 0 {
193            j -= 1;
194            if d & df != 0 {
195                if j == 0 {
196                    if find_remaining(source, config, i + WORD_SIZE_IN_BITS) {
197                        return Some(i);
198                    }
199                    j += 1;
200                }
201                last = j;
202            }
203            d = (d << 1) & get_mask(source, config, i + j - 1);
204        }
205
206        i += last;
207    }
208    None
209}
210
211fn get_mask(source: &[u8], config: &BndmConfig, index: usize) -> usize {
212    unsafe {
213        *config.masks.get_unchecked(*source.get_unchecked(index) as usize)
214    }
215}
216
217/// Checks if the remaining part of the pattern matches the source string.
218///
219/// This function is used when the pattern is longer than the CPU word size.
220/// It checks the remaining part of the pattern (after the first CPU word size characters)
221/// against the corresponding part of the source string.
222///
223/// # Arguments
224///
225/// * `source` - The source string to search for the pattern.
226/// * `config` - The configuration for the BNDM search, which includes the pattern and the
227///              wildcard character.
228/// * `start_index` - The index in the source string from where the remaining part of the
229///                   pattern should be checked.
230///
231/// # Returns
232///
233/// * `bool` - Returns `true` if the remaining part of the pattern matches the corresponding part of the source string, `false` otherwise.
234fn find_remaining(source: &[u8], config: &BndmConfig, start_index: usize) -> bool {
235    config.pattern.iter().skip(WORD_SIZE_IN_BITS).enumerate().all(|(index, &pattern_byte)| unsafe {
236        *source.get_unchecked(start_index + index) == pattern_byte || config.wildcard.map_or(false, |w| pattern_byte == w)
237    })
238}
239
240fn get_pattern_length_within_cpu_word(search_pattern_length: usize) -> usize {
241    min(search_pattern_length, WORD_SIZE_IN_BITS)
242}
243
244fn calculate_wildcard_mask(search_pattern: &[u8], wildcard: Option<u8>) -> usize {
245    wildcard.map_or(0, |wildcard| search_pattern.iter()
246        .fold(0, |mask, &pattern_byte| (mask << 1) | (pattern_byte == wildcard) as usize))
247}
248
249fn generate_masks(search_pattern: &[u8], wildcard: Option<u8>) -> [usize; MASKS_TABLE_SIZE] {
250    let default_mask = calculate_wildcard_mask(search_pattern, wildcard);
251    let mut masks = [default_mask; MASKS_TABLE_SIZE];
252
253    search_pattern.iter().rev().enumerate()
254        .for_each(|(i, &pattern_byte)| masks[pattern_byte as usize] |= 1 << i);
255
256    masks
257}
258
259#[cfg(test)]
260#[path = "./bndm_test.rs"]
261mod bndm_test;