#![allow(dead_code, unused_variables)]
use std::fs::File;
use std::hash::Hasher;
use std::io::{self, Read, Write};
use std::path::Path;
use fasthash::city::Hasher64 as CityHasher64;
use fasthash::murmur::Hasher32 as MurmurHasher32;
use fasthash::{CityHasher, FastHasher, MurmurHasher};
use serde::{Deserialize, Serialize};
use bitarray_naive::BitArray;
pub const DEFAULT_FALSE_POSITIVE_PROBABILITY: f32 = 0.4f32;
#[derive(Debug)]
pub enum SaveBloomFilterError {
Io(io::Error),
Serialize(serde_json::Error),
}
impl From<io::Error> for SaveBloomFilterError {
fn from(err: io::Error) -> Self {
return SaveBloomFilterError::Io(err);
}
}
impl From<serde_json::Error> for SaveBloomFilterError {
fn from(err: serde_json::Error) -> Self {
return SaveBloomFilterError::Serialize(err);
}
}
#[derive(Debug)]
pub enum LoadBloomFilterError {
Io(io::Error),
Serialize(serde_json::Error),
}
impl From<io::Error> for LoadBloomFilterError {
fn from(err: io::Error) -> Self {
return LoadBloomFilterError::Io(err);
}
}
impl From<serde_json::Error> for LoadBloomFilterError {
fn from(err: serde_json::Error) -> Self {
return LoadBloomFilterError::Serialize(err);
}
}
#[derive(Serialize, Deserialize)]
pub struct BloomFilter {
false_positive_probability: f32,
number_of_bits: u32,
items_count: u32,
number_of_hashes: u32,
bit_array: BitArray,
items_added: u32,
}
impl BloomFilter {
pub fn new(
false_positive_probability_opt: Option<f32>,
items_count: u32,
) -> Result<Self, String> {
if items_count == 0 {
return Err("The bloom filter's items count could not be 0.".to_owned());
}
let false_positive_probability: f32 =
false_positive_probability_opt.unwrap_or(DEFAULT_FALSE_POSITIVE_PROBABILITY);
if false_positive_probability <= 0.0 || false_positive_probability >= 1.0 {
return Err(
"The bloom filter's false positive probability should be in range from 0 to 1."
.to_owned(),
);
}
let number_of_bits: u32 =
Self::calc_best_number_of_bits(items_count, false_positive_probability);
let number_of_hashes: u32 =
Self::calc_best_number_of_hashes(false_positive_probability) as u32;
Ok(Self {
false_positive_probability,
number_of_bits,
items_count,
number_of_hashes,
bit_array: BitArray::new(number_of_bits as i64),
items_added: 0,
})
}
pub fn custom(
items_count: u32,
false_positive_probability_opt: Option<f32>,
number_of_bits_opt: Option<u32>,
number_of_hashes_opt: Option<u32>,
) -> Result<Self, String> {
if items_count == 0 {
return Err("The bloom filter's items count could not be 0.".to_owned());
}
let false_positive_probability: f32 =
false_positive_probability_opt.unwrap_or(DEFAULT_FALSE_POSITIVE_PROBABILITY);
if false_positive_probability <= 0.0 || false_positive_probability >= 1.0 {
return Err(
"The bloom filter's false positive probability should be in range from 0 to 1."
.to_owned(),
);
}
let number_of_bits: u32 = number_of_bits_opt.unwrap_or(Self::calc_best_number_of_bits(
items_count,
false_positive_probability,
));
let number_of_hashes: u32 = number_of_hashes_opt
.unwrap_or(Self::calc_best_number_of_hashes(false_positive_probability) as u32);
Ok(Self {
false_positive_probability,
number_of_bits,
items_count,
number_of_hashes,
bit_array: BitArray::new(number_of_bits as i64),
items_added: 0,
})
}
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, LoadBloomFilterError> {
let mut _file = File::open(path)?;
let mut _buffer: String = String::new();
_file.read_to_string(&mut _buffer)?;
let bloom_filter: Self = serde_json::from_str::<Self>(&_buffer)?;
Ok(bloom_filter)
}
pub fn calc_best_number_of_bits(items_count: u32, false_positive_probability: f32) -> u32 {
-(items_count as f32 * false_positive_probability.ln() / f32::powf(f32::ln(2.0), 2.0))
as u32
}
pub fn calc_best_number_of_hashes(false_positive_probability: f32) -> i8 {
-f32::log2(false_positive_probability) as i8
}
pub fn _calc_random_bit_array_index(&mut self, item: &str, seed: u32) -> usize {
let mut murmur_hasher: MurmurHasher32 = MurmurHasher::new();
let mut city_hasher: CityHasher64 = CityHasher::new();
murmur_hasher.write(item.as_bytes());
city_hasher.write(item.as_bytes());
let aka_random_hash: u128 =
murmur_hasher.finish() as u128 + (seed as u128) * city_hasher.finish() as u128;
(aka_random_hash % self.number_of_bits as u128) as usize
}
pub fn insert(&mut self, item: &str) -> bool {
if self.items_added < self.items_count {
for i in 0..self.number_of_hashes {
let item_hash_index: usize = self._calc_random_bit_array_index(item, i);
self.bit_array.set(item_hash_index as i64, true).unwrap();
}
self.items_added += 1;
true
} else {
false
}
}
pub fn is_probably_present(&mut self, item: &str) -> bool {
for i in 0..self.number_of_hashes {
let item_hash_index: usize = self._calc_random_bit_array_index(item, i);
if !self.bit_array.get(item_hash_index as i64).unwrap() {
return false;
}
}
true
}
pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), SaveBloomFilterError> {
let mut _file = File::create(path)?;
let _serialized_bfilter: String = serde_json::to_string(self)?;
_file.write_all(_serialized_bfilter.as_str().as_bytes())?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::{fs, path::Path};
use crate::SaveBloomFilterError;
use super::BloomFilter;
#[test]
fn test_item_not_present() {
let item: &str = "John Green";
let wrong_item: &str = "John White";
let items_capacity = 250_000_000; let mut bloom_filter = match BloomFilter::new(Some(0.35), 2_000_0000) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let probably_present: bool = bloom_filter.is_probably_present(wrong_item);
assert_eq!(probably_present, false);
}
#[test]
fn test_serialize() {
let item: &str = "John Green";
let wrong_item: &str = "John White";
let items_capacity = 250_000_000; let mut bloom_filter = match BloomFilter::new(Some(0.35), 2_000_0000) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let tmp_save_path_ser: &Path = std::path::Path::new("./bfilter_ser.json");
let success: bool = match bloom_filter.save(tmp_save_path_ser) {
Ok(_) => true,
Err(_) => false,
};
assert!(success);
assert!(tmp_save_path_ser.exists());
fs::remove_file(tmp_save_path_ser).unwrap();
assert!(!tmp_save_path_ser.exists());
}
#[test]
fn test_serialize_invalid_path() {
let item: &str = "John Green";
let wrong_item: &str = "John White";
let items_capacity = 250_000_000; let mut bloom_filter = match BloomFilter::new(Some(0.35), 2_000_0000) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let tmp_save_path_ser: &Path = std::path::Path::new(".test/bfilter_ser.json");
let io_error_received: bool = match bloom_filter.save(tmp_save_path_ser) {
Ok(_) => false,
Err(SaveBloomFilterError::Io(err)) => true,
Err(SaveBloomFilterError::Serialize(err)) => false,
};
assert!(io_error_received);
}
#[test]
fn test_serialize_deserialize() {
let item: &str = "John Green";
let wrong_item: &str = "John White";
let items_capacity = 250_000_000; let mut bloom_filter = match BloomFilter::new(Some(0.35), 2_000_0000) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let probably_present: bool = bloom_filter.is_probably_present(wrong_item);
assert_eq!(probably_present, false);
let tmp_save_path_ser_deser: &Path = std::path::Path::new("./bfilter_ser_deser.json");
let success: bool = match bloom_filter.save(tmp_save_path_ser_deser) {
Ok(_) => true,
Err(_) => false,
};
assert!(success);
assert!(tmp_save_path_ser_deser.exists());
let mut loaded_bloom_filter: BloomFilter =
BloomFilter::from_file(tmp_save_path_ser_deser).unwrap();
fs::remove_file(tmp_save_path_ser_deser).unwrap();
assert!(!tmp_save_path_ser_deser.exists());
let probably_present: bool = loaded_bloom_filter.is_probably_present(wrong_item);
assert_eq!(probably_present, false);
}
#[test]
fn test_item_not_present_empty() {
let item: &str = "John Green";
let wrong_item: &str = "John White";
let items_capacity = 250_000_000; let mut bloom_filter = match BloomFilter::new(None, 2_000_0000) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let probably_present: bool = bloom_filter.is_probably_present(wrong_item);
assert_eq!(probably_present, false);
}
#[test]
fn test_item_probably_present() {
let item: &str = "John Green";
let mut bloom_filter = match BloomFilter::new(Some(0.35), 100) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(item);
let probably_present: bool = bloom_filter.is_probably_present(item);
assert_eq!(probably_present, true);
}
#[test]
#[should_panic(
expected = "The bloom filter's false positive probability should be in range from 0 to 1."
)]
fn test_init_with_wrong_false_positive_rate_smaller_zero() {
match BloomFilter::new(Some(0.0), 100) {
Ok(_) => (),
Err(msg) => panic!("{}", msg),
};
}
#[test]
#[should_panic(
expected = "The bloom filter's false positive probability should be in range from 0 to 1."
)]
fn test_init_with_wrong_false_positive_rate_bigger_one() {
match BloomFilter::new(Some(1.2), 100) {
Ok(_) => (),
Err(msg) => panic!("{}", msg),
};
}
#[test]
#[should_panic(expected = "The bloom filter's items count could not be 0.")]
fn test_init_with_wrong_number_of_elements() {
match BloomFilter::new(Some(0.32), 0) {
Ok(_) => (),
Err(msg) => panic!("{}", msg),
};
}
#[test]
fn test_insert_over_capacity() {
let items: [&str; 3] = ["John Green", "Steve Red", "Mark Adams"];
let last_item: &str = "John Doe";
let mut bloom_filter = match BloomFilter::new(Some(0.35), 3) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
for item in items {
assert_eq!(bloom_filter.insert(item), true);
}
assert_eq!(bloom_filter.insert(last_item), false);
}
#[test]
fn test_calc_best_number_of_bits_valid() {
let expected_items_count: u32 = 233_092;
let expected_false_positive_probability: f32 = 0.01;
let calculated_best_number_of_bits: u32 = BloomFilter::calc_best_number_of_bits(
expected_items_count,
expected_false_positive_probability,
);
assert!(calculated_best_number_of_bits > 0);
assert!(calculated_best_number_of_bits / expected_items_count as u32 > 8);
}
#[test]
fn test_calc_best_number_of_hashes() {
let expected_false_positive_probability: f32 = 0.01;
let calculated_best_number_of_hashes: i8 =
BloomFilter::calc_best_number_of_hashes(expected_false_positive_probability);
assert!(calculated_best_number_of_hashes > 0);
}
#[test]
fn test_calc_random_bit_array_index() {
let test_item: &str = "Hello test world!";
let test_seed: u32 = 2;
let test_false_positive_probability: f32 = 0.01;
let test_items_count: u32 = 923578;
let mut bloom_filter: BloomFilter =
match BloomFilter::new(Some(test_false_positive_probability), test_items_count) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
for i in 0..9999 {
bloom_filter._calc_random_bit_array_index(test_item, test_seed);
}
}
#[test]
fn test_with_custom_parameters() {
let test_item: &str = "Hello test world!";
let test_absent_item: &str = "Absent";
let test_false_positive_probability: f32 = 0.01;
let test_items_count: u32 = 923578;
let test_capacity: u32 = 923578 * 10;
let test_number_of_hashes: u32 = 4;
let mut bloom_filter: BloomFilter = match BloomFilter::custom(
test_items_count,
Some(test_false_positive_probability),
Some(test_capacity),
Some(test_number_of_hashes),
) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(test_item);
let probably_present: bool = bloom_filter.is_probably_present(test_absent_item);
assert_eq!(probably_present, false);
}
#[test]
fn test_with_custom_parameters_optional_empty() {
let test_item: &str = "Hello test world!";
let test_absent_item: &str = "Absent";
let test_items_count: u32 = 923578;
let mut bloom_filter: BloomFilter =
match BloomFilter::custom(test_items_count, None, None, None) {
Ok(bloom_filter) => bloom_filter,
Err(msg) => panic!("{}", msg),
};
bloom_filter.insert(test_item);
let probably_present: bool = bloom_filter.is_probably_present(test_absent_item);
assert_eq!(probably_present, false);
}
}