use std::mem;
use packed::pattern::{PatternID, Patterns};
use Match;
/// The type of a rolling hash value. All hash arithmetic in this module is
/// done with wrapping operations on this type.
type Hash = usize;
/// The number of buckets that pattern hashes are distributed over. A hash is
/// mapped to its bucket with `hash % NUM_BUCKETS`.
const NUM_BUCKETS: usize = 64;
/// A multi-pattern searcher based on the Rabin-Karp rolling hash.
///
/// Only the first `hash_len` bytes of every pattern are hashed. A search
/// rolls a hash of the same width over the haystack and, whenever the hash
/// equals a stored pattern hash, verifies the full pattern at that position.
///
/// The searcher does not keep the patterns themselves; the same `Patterns`
/// value used in `new` must be handed to `find_at`.
#[derive(Clone, Debug)]
pub struct RabinKarp {
/// `NUM_BUCKETS` buckets of `(hash, pattern id)` pairs. A pattern is stored
/// in the bucket at index `hash % NUM_BUCKETS`, where `hash` is the hash of
/// its first `hash_len` bytes.
buckets: Vec<Vec<(Hash, PatternID)>>,
/// The number of bytes covered by the rolling hash. Set from
/// `patterns.minimum_len()` at construction and asserted to be at least 1.
hash_len: usize,
/// The weight of the oldest byte in the hash window: 1 shifted left
/// `hash_len - 1` times, with bits shifted past the top of `usize` dropped.
/// Used to remove the outgoing byte when the window is advanced.
hash_2pow: usize,
/// The value of `patterns.max_pattern_id()` at construction. `find_at`
/// asserts that the patterns it is given report the same value.
max_pattern_id: PatternID,
}
impl RabinKarp {
    /// Build a Rabin-Karp searcher for the given patterns.
    ///
    /// The first `patterns.minimum_len()` bytes of every pattern are hashed
    /// and the resulting `(hash, id)` pair is stored in the bucket selected
    /// by `hash % NUM_BUCKETS`.
    ///
    /// # Panics
    ///
    /// Panics if `patterns.len()` is zero or if `patterns.minimum_len()` is
    /// zero.
    pub fn new(patterns: &Patterns) -> RabinKarp {
        assert!(patterns.len() >= 1);
        let hash_len = patterns.minimum_len();
        assert!(hash_len >= 1);

        // Weight of the oldest byte in a window of `hash_len` bytes, i.e.
        // 2^(hash_len - 1). `wrapping_shl(1)` never panics; bits that move
        // past the top of `usize` are simply dropped, so for windows wider
        // than the word size this ends up as 0, which agrees with `hash`
        // (bytes that old have been shifted out of the hash entirely).
        let mut hash_2pow = 1usize;
        for _ in 1..hash_len {
            hash_2pow = hash_2pow.wrapping_shl(1);
        }

        let mut rk = RabinKarp {
            buckets: vec![vec![]; NUM_BUCKETS],
            hash_len,
            hash_2pow,
            max_pattern_id: patterns.max_pattern_id(),
        };
        for (id, pat) in patterns.iter() {
            // Only the common-length prefix of each pattern is hashed, so a
            // single rolling hash over the haystack can be compared against
            // every pattern.
            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
            let bucket = hash % NUM_BUCKETS;
            rk.buckets[bucket].push((hash, id));
        }
        rk
    }

    /// Search `haystack` for a match of any pattern, starting at byte offset
    /// `at`.
    ///
    /// Positions are examined in increasing order. At each position every
    /// stored pattern whose prefix hash equals the current window hash is
    /// verified, in the order the patterns were inserted into their bucket,
    /// and the first successful verification is returned.
    ///
    /// Returns `None` when fewer than `hash_len` bytes remain at `at`
    /// (including when `at` is past the end of the haystack) or when no
    /// pattern matches.
    ///
    /// # Panics
    ///
    /// Panics if `patterns.max_pattern_id()` differs from the value recorded
    /// when this searcher was built.
    pub fn find_at(
        &self,
        patterns: &Patterns,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        assert_eq!(NUM_BUCKETS, self.buckets.len());
        assert_eq!(
            self.max_pattern_id,
            patterns.max_pattern_id(),
            "Rabin-Karp must be called with same patterns it was built with",
        );

        // The largest offset at which a full hash window still fits. Using
        // `checked_sub` here (rather than testing `at + hash_len`) keeps the
        // bounds check free of overflow for arbitrarily large `at`.
        let last_start = match haystack.len().checked_sub(self.hash_len) {
            Some(last_start) if at <= last_start => last_start,
            _ => return None,
        };

        // `at <= last_start` guarantees `at + hash_len <= haystack.len()`
        // from here on, so the additions below cannot overflow.
        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
        loop {
            let bucket = &self.buckets[hash % NUM_BUCKETS];
            for &(phash, pid) in bucket {
                // Distinct hashes can share a bucket, so compare the full
                // hash before paying for a byte-wise verification.
                if phash == hash {
                    if let Some(c) = self.verify(patterns, pid, haystack, at) {
                        return Some(c);
                    }
                }
            }
            // The window already ends at the end of the haystack; there is
            // no further byte to roll in.
            if at >= last_start {
                return None;
            }
            hash = self.update_hash(
                hash,
                haystack[at],
                haystack[at + self.hash_len],
            );
            at += 1;
        }
    }

    /// Return an approximation of the heap memory used by this searcher, in
    /// bytes.
    ///
    /// The figure is the size of one `Vec` header per bucket plus one
    /// `(Hash, PatternID)` entry for each of `max_pattern_id + 1` patterns.
    pub fn heap_bytes(&self) -> usize {
        let num_patterns = self.max_pattern_id as usize + 1;
        self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
            + num_patterns * mem::size_of::<(Hash, PatternID)>()
    }

    /// Check whether the pattern with the given id actually occurs in
    /// `haystack` at offset `at`.
    ///
    /// A hash hit only says that the first `hash_len` bytes probably agree;
    /// this confirms the whole pattern via `is_prefix`. On success the match
    /// spans `at..at + pat.len()`.
    ///
    /// Marked `#[cold]` because it is only reached after a hash hit.
    #[cold]
    fn verify(
        &self,
        patterns: &Patterns,
        id: PatternID,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let pat = patterns.get(id);
        if pat.is_prefix(&haystack[at..]) {
            Some(Match::from_span(id as usize, at, at + pat.len()))
        } else {
            None
        }
    }

    /// Hash a window of exactly `hash_len` bytes from scratch.
    ///
    /// Each step doubles the running hash (dropping the top bit) and adds
    /// the next byte with wrapping arithmetic.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len()` is not equal to `hash_len`.
    fn hash(&self, bytes: &[u8]) -> Hash {
        assert_eq!(self.hash_len, bytes.len());
        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Advance a window hash by one byte.
    ///
    /// Removes the contribution of `old_byte` (the byte leaving the window,
    /// weighted by `hash_2pow`), doubles what remains, and adds `new_byte`
    /// (the byte entering the window). All arithmetic wraps.
    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
            .wrapping_shl(1)
            .wrapping_add(new_byte as usize)
    }
}