1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
//! Boyer-Moore-Horspool substring search over byte slices.

extern crate memchr;
use std::cmp;

/// Fill `skip` with the Boyer-Moore-Horspool bad-character shift table for `pat`.
///
/// After the call, `skip[b]` is the distance the search window may advance
/// when the text byte aligned with the last pattern position is `b`:
///
/// * bytes that do not occur in `pat[..pat.len() - 1]` get `pat.len()`;
/// * every other byte gets the distance from its last occurrence (excluding
///   the final pattern byte) to the end of the pattern.
///
/// Shifts are stored as `u16`, so distances above `u16::MAX` are clamped.
/// A shorter shift than the ideal one is still correct, only slower.
///
/// An empty `pat` is accepted and yields a table of all zeros.
fn bmh_skip(pat: &[u8], skip: &mut [u16; 256]) {
    // Saturating conversion of a shift distance into the table's element type.
    let clamp = |distance: usize| cmp::min(distance, u16::max_value() as usize) as u16;

    // Default: the byte is not in the pattern, so shift by the whole pattern.
    *skip = [clamp(pat.len()); 256];

    // The final pattern byte is deliberately excluded from the table; using
    // `split_last` also avoids the `pat.len() - 1` underflow on an empty pattern.
    if let Some((_, head)) = pat.split_last() {
        for (index, &byte) in head.iter().enumerate() {
            // Later occurrences overwrite earlier ones, leaving the smallest shift.
            skip[byte as usize] = clamp(pat.len() - index - 1);
        }
    }
}

/// Boyer-Moore-Horspool substring search
///
/// Returns the byte offset of the first occurrence of `pat` in `text`, or
/// `None` if `pat` does not occur in `text`. An empty `pat` matches at
/// offset 0, including when `text` is empty.
pub fn find(text: &[u8], pat: &[u8]) -> Option<usize> {
    let pat_len = pat.len();

    // Answer the empty pattern before any preprocessing: the search below
    // needs a last pattern byte to align on.
    if pat_len == 0 {
        return Some(0);
    }

    // A pattern longer than the text cannot match; skip building the table.
    if pat_len > text.len() {
        return None;
    }

    let mut skip = [0; 256];
    bmh_skip(pat, &mut skip);

    let pat_len_m1 = pat_len - 1;
    let pat_last = pat[pat_len_m1];

    // initial search by memchr: no match can start before the first
    // occurrence of the pattern's first byte
    let mut j = match memchr::memchr(pat[0], text) {
        Some(x) => x,
        None => return None,
    };

    // `j` is the candidate start; the loop ends once the window's last
    // position falls outside `text`.
    while let Some(&c) = text.get(j + pat_len_m1) {
        // check the back character of the pattern first, then the whole window
        if c == pat_last && &text[j..j + pat_len] == pat {
            return Some(j);
        }
        // advance by the shift for the text byte under the window's last position
        j += skip[c as usize] as usize;
    }
    None
}

#[test]
fn bmh_preprocess() {
    // Build the shift table for a fixed pattern and spot-check a few entries,
    // including a byte (`t`) that does not occur in the pattern at all.
    let mut table = [0; 256];
    bmh_skip(b"gcagagag", &mut table);

    let expected = [(b'g', 2), (b'c', 6), (b'a', 1), (b't', 8)];
    for &(byte, shift) in expected.iter() {
        assert_eq!(table[byte as usize], shift);
    }
}

#[test]
fn bmh_find() {
    // Single-byte needles: one absent, one at the very end of the text.
    let short = b"abc";
    assert_eq!(find(short, b"d"), None);
    assert_eq!(find(short, b"c"), Some(2));

    let haystack = "longer text and so on";
    let haystack_bytes = haystack.as_bytes();

    // Every window of width 1..=16 is a substring of the haystack; our result
    // must agree with the standard library's `str::find`.
    for width in 1..17 {
        for needle in haystack_bytes.windows(width) {
            let needle_str = ::std::str::from_utf8(needle).unwrap();
            let expected = haystack.find(needle_str);
            assert!(expected.is_some());
            assert_eq!(find(haystack_bytes, needle), expected);
        }
    }

    // A needle longer than 16 bytes that runs to the end of the haystack.
    let long_needle = b"ger text and so on";
    assert!(long_needle.len() > 16);
    assert_eq!(Some(3), find(haystack_bytes, long_needle));
}