use serde::{Deserialize, Serialize};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use scirs2_core::random::{Random, RngExt};
use crate::model::StarTriple;
use crate::StarResult;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BloomFilter {
bits: Vec<u64>,
bit_count: usize,
num_hashes: usize,
item_count: usize,
false_positive_rate: f64,
hash_seeds: Vec<u64>,
}
impl BloomFilter {
pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
let bit_count = Self::optimal_bit_count(expected_items, false_positive_rate);
let num_hashes = Self::optimal_hash_count(bit_count, expected_items);
let vec_size = (bit_count + 63) / 64;
let bits = vec![0u64; vec_size];
let mut rng = Random::seed(42); let hash_seeds: Vec<u64> = (0..num_hashes).map(|_| rng.random::<u64>()).collect();
Self {
bits,
bit_count,
num_hashes,
item_count: 0,
false_positive_rate,
hash_seeds,
}
}
fn optimal_bit_count(expected_items: usize, fp_rate: f64) -> usize {
let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
let bit_count = -(expected_items as f64) * fp_rate.ln() / ln2_squared;
bit_count.ceil() as usize
}
fn optimal_hash_count(bit_count: usize, expected_items: usize) -> usize {
let k = (bit_count as f64 / expected_items as f64) * std::f64::consts::LN_2;
k.ceil().max(1.0) as usize
}
pub fn insert(&mut self, item: &[u8]) {
let hashes = self.compute_hashes(item);
for hash in hashes {
let bit_index = hash % self.bit_count;
let vec_index = bit_index / 64;
let bit_offset = bit_index % 64;
self.bits[vec_index] |= 1u64 << bit_offset;
}
self.item_count += 1;
}
pub fn contains(&self, item: &[u8]) -> bool {
let hashes = self.compute_hashes(item);
for hash in hashes {
let bit_index = hash % self.bit_count;
let vec_index = bit_index / 64;
let bit_offset = bit_index % 64;
if (self.bits[vec_index] & (1u64 << bit_offset)) == 0 {
return false; }
}
true }
fn compute_hashes(&self, item: &[u8]) -> Vec<usize> {
let mut hashes = Vec::with_capacity(self.num_hashes);
for &seed in &self.hash_seeds {
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
item.hash(&mut hasher);
hashes.push(hasher.finish() as usize);
}
hashes
}
pub fn estimated_false_positive_rate(&self) -> f64 {
if self.item_count == 0 {
return 0.0;
}
let k = self.num_hashes as f64;
let n = self.item_count as f64;
let m = self.bit_count as f64;
let exponent = -k * n / m;
let base = 1.0 - exponent.exp();
base.powf(k)
}
pub fn statistics(&self) -> BloomFilterStatistics {
let bits_set = self
.bits
.iter()
.map(|&chunk| chunk.count_ones() as usize)
.sum();
let fill_ratio = bits_set as f64 / self.bit_count as f64;
BloomFilterStatistics {
bit_count: self.bit_count,
bits_set,
fill_ratio,
item_count: self.item_count,
num_hashes: self.num_hashes,
configured_fp_rate: self.false_positive_rate,
estimated_fp_rate: self.estimated_false_positive_rate(),
memory_bytes: self.bits.len() * 8,
}
}
pub fn clear(&mut self) {
self.bits.fill(0);
self.item_count = 0;
}
pub fn should_resize(&self) -> bool {
self.estimated_false_positive_rate() > self.false_positive_rate * 2.0
}
}
#[derive(Debug, Clone)]
pub struct BloomFilterStatistics {
pub bit_count: usize,
pub bits_set: usize,
pub fill_ratio: f64,
pub item_count: usize,
pub num_hashes: usize,
pub configured_fp_rate: f64,
pub estimated_fp_rate: f64,
pub memory_bytes: usize,
}
pub struct TripleBloomFilter {
subject_filter: BloomFilter,
predicate_filter: BloomFilter,
object_filter: BloomFilter,
triple_filter: BloomFilter,
}
impl TripleBloomFilter {
pub fn new(expected_triples: usize, false_positive_rate: f64) -> Self {
Self {
subject_filter: BloomFilter::new(expected_triples, false_positive_rate),
predicate_filter: BloomFilter::new(expected_triples / 10, false_positive_rate), object_filter: BloomFilter::new(expected_triples, false_positive_rate),
triple_filter: BloomFilter::new(expected_triples, false_positive_rate),
}
}
pub fn insert_triple(&mut self, triple: &StarTriple) -> StarResult<()> {
let subject_bytes = format!("{:?}", triple.subject).into_bytes();
let predicate_bytes = format!("{:?}", triple.predicate).into_bytes();
let object_bytes = format!("{:?}", triple.object).into_bytes();
let triple_bytes = format!("{:?}", triple).into_bytes();
self.subject_filter.insert(&subject_bytes);
self.predicate_filter.insert(&predicate_bytes);
self.object_filter.insert(&object_bytes);
self.triple_filter.insert(&triple_bytes);
Ok(())
}
pub fn might_contain_triple(&self, triple: &StarTriple) -> bool {
let triple_bytes = format!("{:?}", triple).into_bytes();
self.triple_filter.contains(&triple_bytes)
}
pub fn might_have_subject(&self, subject: &str) -> bool {
self.subject_filter.contains(subject.as_bytes())
}
pub fn might_have_predicate(&self, predicate: &str) -> bool {
self.predicate_filter.contains(predicate.as_bytes())
}
pub fn might_have_object(&self, object: &str) -> bool {
self.object_filter.contains(object.as_bytes())
}
pub fn statistics(&self) -> TripleBloomFilterStatistics {
TripleBloomFilterStatistics {
subject_stats: self.subject_filter.statistics(),
predicate_stats: self.predicate_filter.statistics(),
object_stats: self.object_filter.statistics(),
triple_stats: self.triple_filter.statistics(),
}
}
pub fn clear(&mut self) {
self.subject_filter.clear();
self.predicate_filter.clear();
self.object_filter.clear();
self.triple_filter.clear();
}
}
#[derive(Debug, Clone)]
pub struct TripleBloomFilterStatistics {
pub subject_stats: BloomFilterStatistics,
pub predicate_stats: BloomFilterStatistics,
pub object_stats: BloomFilterStatistics,
pub triple_stats: BloomFilterStatistics,
}
impl TripleBloomFilterStatistics {
pub fn total_memory_bytes(&self) -> usize {
self.subject_stats.memory_bytes
+ self.predicate_stats.memory_bytes
+ self.object_stats.memory_bytes
+ self.triple_stats.memory_bytes
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::model::StarTerm;
#[test]
fn test_bloom_filter_basic() {
let mut filter = BloomFilter::new(100, 0.01);
filter.insert(b"item1");
filter.insert(b"item2");
filter.insert(b"item3");
assert!(filter.contains(b"item1"));
assert!(filter.contains(b"item2"));
assert!(filter.contains(b"item3"));
assert!(!filter.contains(b"item4"));
}
#[test]
fn test_bloom_filter_false_positive_rate() {
let mut filter = BloomFilter::new(1000, 0.01);
for i in 0..1000 {
let item = format!("item{}", i);
filter.insert(item.as_bytes());
}
let mut false_positives = 0;
let test_count = 10000;
for i in 1000..1000 + test_count {
let item = format!("item{}", i);
if filter.contains(item.as_bytes()) {
false_positives += 1;
}
}
let fp_rate = false_positives as f64 / test_count as f64;
assert!(fp_rate < 0.05, "False positive rate too high: {}", fp_rate);
}
#[test]
fn test_bloom_filter_statistics() {
let mut filter = BloomFilter::new(100, 0.01);
for i in 0..50 {
filter.insert(&[i as u8]);
}
let stats = filter.statistics();
assert_eq!(stats.item_count, 50);
assert!(stats.fill_ratio > 0.0 && stats.fill_ratio < 1.0);
assert!(stats.estimated_fp_rate <= stats.configured_fp_rate * 2.0);
}
#[test]
fn test_triple_bloom_filter() {
let mut filter = TripleBloomFilter::new(1000, 0.01);
let triple1 = StarTriple::new(
StarTerm::iri("http://example.org/s1").unwrap(),
StarTerm::iri("http://example.org/p1").unwrap(),
StarTerm::iri("http://example.org/o1").unwrap(),
);
let triple2 = StarTriple::new(
StarTerm::iri("http://example.org/s2").unwrap(),
StarTerm::iri("http://example.org/p2").unwrap(),
StarTerm::iri("http://example.org/o2").unwrap(),
);
filter.insert_triple(&triple1).unwrap();
assert!(filter.might_contain_triple(&triple1));
assert!(!filter.might_contain_triple(&triple2));
}
#[test]
fn test_triple_component_queries() {
let mut filter = TripleBloomFilter::new(1000, 0.01);
let triple = StarTriple::new(
StarTerm::iri("http://example.org/alice").unwrap(),
StarTerm::iri("http://example.org/knows").unwrap(),
StarTerm::iri("http://example.org/bob").unwrap(),
);
filter.insert_triple(&triple).unwrap();
let subject_str = format!("{:?}", triple.subject);
let predicate_str = format!("{:?}", triple.predicate);
let object_str = format!("{:?}", triple.object);
assert!(filter.might_have_subject(&subject_str));
assert!(filter.might_have_predicate(&predicate_str));
assert!(filter.might_have_object(&object_str));
}
#[test]
fn test_bloom_filter_clear() {
let mut filter = BloomFilter::new(100, 0.01);
filter.insert(b"item1");
assert!(filter.contains(b"item1"));
filter.clear();
assert_eq!(filter.item_count, 0);
}
#[test]
fn test_optimal_parameters() {
let bit_count = BloomFilter::optimal_bit_count(1000, 0.01);
let hash_count = BloomFilter::optimal_hash_count(bit_count, 1000);
assert!(bit_count > 1000); assert!((1..=10).contains(&hash_count)); }
}