// datasketches 0.2.0
//
// A software library of stochastic streaming algorithms (a.k.a. sketches)
// Documentation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Bloom Filter Serialization Compatibility Tests
//!
//! These tests verify binary compatibility with Apache DataSketches implementations:
//! - Java (datasketches-java)
//! - C++ (datasketches-cpp)
//!
//! Test data is generated by the reference implementations and stored in:
//! `tests/serialization_test_data/`

mod common;

use std::fs;
use std::path::PathBuf;

use common::serialization_test_data;
use datasketches::bloom::BloomFilter;

/// Checks one serialized Bloom filter file produced by a reference implementation.
///
/// The file is read and deserialized, its reported properties are compared with the
/// expectations, a sample of the items the generator inserted is looked up, and the filter
/// is then serialized and deserialized again to confirm the round-trip keeps it equivalent.
///
/// # Arguments
///
/// * `path` - Location of the serialized filter.
/// * `expected_num_items` - The `n` the file was generated for. `0` means the filter must be
///   empty; otherwise the integers `0..n / 10` are expected to be present.
/// * `expected_num_hashes` - Number of hash functions the filter must report.
///
/// # Panics
///
/// Panics if the file cannot be read or deserialized, or if any check fails. Every failure
/// message names the offending file.
fn test_bloom_filter_file(path: PathBuf, expected_num_items: u64, expected_num_hashes: u16) {
    let file = path.display();

    // Report which fixture is missing or corrupt instead of a bare `unwrap` panic.
    let bytes =
        fs::read(&path).unwrap_or_else(|err| panic!("Failed to read {}: {}", file, err));
    let filter1 = BloomFilter::deserialize(&bytes)
        .unwrap_or_else(|err| panic!("Deserialization failed for {}: {}", file, err));

    // Verify basic properties
    assert_eq!(
        filter1.num_hashes(),
        expected_num_hashes,
        "Wrong num_hashes in {}",
        file
    );

    // Check empty state
    if expected_num_items == 0 {
        assert!(
            filter1.is_empty(),
            "Filter should be empty for n=0 in {}",
            file
        );
        assert_eq!(
            filter1.bits_used(),
            0,
            "Empty filter should have 0 bits set in {}",
            file
        );
    } else {
        assert!(
            !filter1.is_empty(),
            "Filter should not be empty for n={} in {}",
            expected_num_items,
            file
        );
        assert!(
            filter1.bits_used() > 0,
            "Non-empty filter should have bits set in {}",
            file
        );
    }

    // The generator inserts the integers 0 to n/10-1.
    // C++ code: for (uint64_t i = 0; i < n / 10; ++i) bf.update(i);
    // For large n only the first 100 of them are sampled to keep tests fast. When nothing
    // was inserted the sample is empty and the item checks below pass trivially.
    let sample_size = std::cmp::min(expected_num_items / 10, 100);

    // A Bloom filter must never report an inserted item as absent.
    let false_negatives = (0..sample_size)
        .filter(|&i| !filter1.contains(&i))
        .count();
    assert_eq!(
        false_negatives,
        0,
        "Found {} false negatives out of {} items in {}",
        false_negatives,
        sample_size,
        file
    );

    // Serialize and deserialize again to test round-trip
    let serialized_bytes = filter1.serialize();
    let filter2 = BloomFilter::deserialize(&serialized_bytes).unwrap_or_else(|err| {
        panic!(
            "Deserialization failed after round-trip for {}: {}",
            file, err
        )
    });

    // Check that both filters are functionally equivalent
    assert_eq!(
        filter1.num_hashes(),
        filter2.num_hashes(),
        "num_hashes mismatch after round-trip for {}",
        file
    );
    assert_eq!(
        filter1.capacity(),
        filter2.capacity(),
        "capacity mismatch after round-trip for {}",
        file
    );
    assert_eq!(
        filter1.bits_used(),
        filter2.bits_used(),
        "bits_used mismatch after round-trip for {}",
        file
    );

    // Verify same items are present after round-trip
    for i in 0..sample_size {
        assert_eq!(
            filter1.contains(&i),
            filter2.contains(&i),
            "Item {} presence differs after round-trip for {}",
            i,
            file
        );
    }
}

/// Checks `bf_n0_h3_java.sk` from `java_generated_files` (n = 0, 3 hashes).
#[test]
fn test_java_bloom_n0_h3() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n0_h3_java.sk"),
        0,
        3,
    );
}

/// Checks `bf_n0_h5_java.sk` from `java_generated_files` (n = 0, 5 hashes).
#[test]
fn test_java_bloom_n0_h5() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n0_h5_java.sk"),
        0,
        5,
    );
}

/// Checks `bf_n10000_h3_java.sk` from `java_generated_files` (n = 10000, 3 hashes).
#[test]
fn test_java_bloom_n10000_h3() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n10000_h3_java.sk"),
        10_000,
        3,
    );
}

/// Checks `bf_n10000_h5_java.sk` from `java_generated_files` (n = 10000, 5 hashes).
#[test]
fn test_java_bloom_n10000_h5() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n10000_h5_java.sk"),
        10_000,
        5,
    );
}

/// Checks `bf_n2000000_h3_java.sk` from `java_generated_files` (n = 2000000, 3 hashes).
#[test]
fn test_java_bloom_n2000000_h3() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n2000000_h3_java.sk"),
        2_000_000,
        3,
    );
}

/// Checks `bf_n2000000_h5_java.sk` from `java_generated_files` (n = 2000000, 5 hashes).
#[test]
fn test_java_bloom_n2000000_h5() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n2000000_h5_java.sk"),
        2_000_000,
        5,
    );
}

/// Checks `bf_n30000000_h3_java.sk` from `java_generated_files` (n = 30000000, 3 hashes).
#[test]
fn test_java_bloom_n30000000_h3() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n30000000_h3_java.sk"),
        30_000_000,
        3,
    );
}

/// Checks `bf_n30000000_h5_java.sk` from `java_generated_files` (n = 30000000, 5 hashes).
#[test]
fn test_java_bloom_n30000000_h5() {
    test_bloom_filter_file(
        serialization_test_data("java_generated_files", "bf_n30000000_h5_java.sk"),
        30_000_000,
        5,
    );
}

/// Checks `bf_n0_h3_cpp.sk` from `cpp_generated_files` (n = 0, 3 hashes).
#[test]
fn test_cpp_bloom_n0_h3() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n0_h3_cpp.sk"),
        0,
        3,
    );
}

/// Checks `bf_n0_h5_cpp.sk` from `cpp_generated_files` (n = 0, 5 hashes).
#[test]
fn test_cpp_bloom_n0_h5() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n0_h5_cpp.sk"),
        0,
        5,
    );
}

/// Checks `bf_n10000_h3_cpp.sk` from `cpp_generated_files` (n = 10000, 3 hashes).
#[test]
fn test_cpp_bloom_n10000_h3() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n10000_h3_cpp.sk"),
        10_000,
        3,
    );
}

/// Checks `bf_n10000_h5_cpp.sk` from `cpp_generated_files` (n = 10000, 5 hashes).
#[test]
fn test_cpp_bloom_n10000_h5() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n10000_h5_cpp.sk"),
        10_000,
        5,
    );
}

/// Checks `bf_n2000000_h3_cpp.sk` from `cpp_generated_files` (n = 2000000, 3 hashes).
#[test]
fn test_cpp_bloom_n2000000_h3() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n2000000_h3_cpp.sk"),
        2_000_000,
        3,
    );
}

/// Checks `bf_n2000000_h5_cpp.sk` from `cpp_generated_files` (n = 2000000, 5 hashes).
#[test]
fn test_cpp_bloom_n2000000_h5() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n2000000_h5_cpp.sk"),
        2_000_000,
        5,
    );
}

/// Checks `bf_n30000000_h3_cpp.sk` from `cpp_generated_files` (n = 30000000, 3 hashes).
#[test]
fn test_cpp_bloom_n30000000_h3() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n30000000_h3_cpp.sk"),
        30_000_000,
        3,
    );
}

/// Checks `bf_n30000000_h5_cpp.sk` from `cpp_generated_files` (n = 30000000, 5 hashes).
#[test]
fn test_cpp_bloom_n30000000_h5() {
    test_bloom_filter_file(
        serialization_test_data("cpp_generated_files", "bf_n30000000_h5_cpp.sk"),
        30_000_000,
        5,
    );
}