// datasketches 0.2.0
//
// A software library of stochastic streaming algorithms (a.k.a. sketches)
// Documentation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! HLL Sketch Serialization Compatibility Tests
//!
//! These tests verify binary compatibility with Apache DataSketches implementations:
//! - Java (datasketches-java)
//! - C++ (datasketches-cpp)
//!
//! Test data is generated by the reference implementations and stored in:
//! `tests/serialization_test_data/`

mod common;

use std::fs;
use std::path::PathBuf;

use common::serialization_test_data;
use datasketches::hll::HllSketch;

/// Loads a serialized HLL sketch fixture and checks it against expectations.
///
/// The checks performed are:
/// 1. the deserialized sketch reports `expected_lg_k` as its `lg_config_k`;
/// 2. its estimate is within 2% of `expected_cardinality` (or below 1.0 when
///    the expected cardinality is zero);
/// 3. a serialize/deserialize round-trip yields an equal sketch with the same
///    `lg_config_k` and the same estimate.
///
/// # Panics
///
/// Panics if the file cannot be read, if deserialization fails, or if any of
/// the checks above fails. Every panic message names the offending file,
/// because callers run this helper in a loop over many fixtures.
fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u8) {
    let expected = expected_cardinality as f64;

    // Report the path on failure: a bare `unwrap()` would not say which
    // fixture of the calling loop was missing or malformed.
    let bytes = fs::read(&path)
        .unwrap_or_else(|err| panic!("Failed to read {}: {}", path.display(), err));
    let sketch1 = HllSketch::deserialize(&bytes)
        .unwrap_or_else(|err| panic!("Deserialization failed for {}: {}", path.display(), err));
    let estimate1 = sketch1.estimate();

    assert_eq!(
        sketch1.lg_config_k(),
        expected_lg_k,
        "Wrong lg_config_k in {}",
        path.display()
    );

    // Check cardinality estimate with error bounds
    // For lg_k=12, theoretical RSE ≈ 1.625%, but we use 2% margin to account for:
    // - Small sample sizes (especially n < 100)
    // - Out-of-order mode (composite estimator)
    // - Variation across implementations
    if expected > 0.0 {
        let error_margin = 0.02; // 2% error margin
        let lower_bound = expected * (1.0 - error_margin);
        let upper_bound = expected * (1.0 + error_margin);

        assert!(
            estimate1 >= lower_bound && estimate1 <= upper_bound,
            "Estimate {} outside bounds [{}, {}] for expected {} in {}",
            estimate1,
            lower_bound,
            upper_bound,
            expected,
            path.display()
        );
    } else {
        // For n=0, estimate should be very close to 0
        assert!(
            estimate1 < 1.0,
            "Expected near-zero estimate for empty sketch, got {} in {}",
            estimate1,
            path.display()
        );
    }

    // Serialize and deserialize again to test round-trip
    let serialized_bytes = sketch1.serialize();
    let sketch2 = HllSketch::deserialize(&serialized_bytes).unwrap_or_else(|err| {
        panic!(
            "Deserialization failed after round-trip for {}: {}",
            path.display(),
            err
        )
    });

    // The configuration must survive the round-trip unchanged
    assert_eq!(
        sketch1.lg_config_k(),
        sketch2.lg_config_k(),
        "lg_config_k mismatch after round-trip for {}",
        path.display()
    );

    // The whole sketch must compare equal after the round-trip
    assert_eq!(
        sketch1,
        sketch2,
        "Sketches are not equal after round-trip for {}",
        path.display()
    );

    // Verify estimates match after round-trip
    let estimate2 = sketch2.estimate();
    assert_eq!(
        estimate1,
        estimate2,
        "Estimates differ after round-trip for {}",
        path.display()
    );
}

/// Runs the shared fixture checks on every `hll4_n*_java.sk` file from
/// `java_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_java_hll4_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll4_n{}_java.sk", count);
        let fixture = serialization_test_data("java_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Runs the shared fixture checks on every `hll6_n*_java.sk` file from
/// `java_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_java_hll6_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll6_n{}_java.sk", count);
        let fixture = serialization_test_data("java_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Runs the shared fixture checks on every `hll8_n*_java.sk` file from
/// `java_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_java_hll8_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll8_n{}_java.sk", count);
        let fixture = serialization_test_data("java_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Runs the shared fixture checks on every `hll4_n*_cpp.sk` file from
/// `cpp_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_cpp_hll4_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll4_n{}_cpp.sk", count);
        let fixture = serialization_test_data("cpp_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Runs the shared fixture checks on every `hll6_n*_cpp.sk` file from
/// `cpp_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_cpp_hll6_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll6_n{}_cpp.sk", count);
        let fixture = serialization_test_data("cpp_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Runs the shared fixture checks on every `hll8_n*_cpp.sk` file from
/// `cpp_generated_files`, expecting `lg_config_k` of 12.
#[test]
fn test_cpp_hll8_compatibility() {
    // The `n` embedded in each file name doubles as the expected cardinality.
    const CARDINALITIES: [usize; 8] = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];

    CARDINALITIES.iter().copied().for_each(|count| {
        let file_name = format!("hll8_n{}_cpp.sk", count);
        let fixture = serialization_test_data("cpp_generated_files", &file_name);
        test_sketch_file(fixture, count, 12);
    });
}

/// Prints a table of expected vs. estimated cardinality for four
/// Java-generated HLL8 fixtures and asserts that each estimate is within 2%
/// of the expected value.
///
/// Every failure message names the fixture file, so a failing row can be
/// identified without re-running the test with captured output enabled.
#[test]
fn test_estimate_accuracy() {
    // (fixture directory, file name, expected cardinality)
    let test_cases = [
        ("java_generated_files", "hll8_n1000_java.sk", 1000),
        ("java_generated_files", "hll8_n10000_java.sk", 10000),
        ("java_generated_files", "hll8_n100000_java.sk", 100000),
        ("java_generated_files", "hll8_n1000000_java.sk", 1000000),
    ];

    println!("\nCardinality Estimation Accuracy:");
    println!("{:<12} {:<12} {:<10}", "Expected", "Estimate", "Error %");
    println!("{:-<40}", "");

    for (dir, file, expected) in test_cases {
        let path = serialization_test_data(dir, file);
        let bytes = fs::read(&path)
            .unwrap_or_else(|err| panic!("Failed to read {}: {}", path.display(), err));
        let sketch = HllSketch::deserialize(&bytes)
            .unwrap_or_else(|err| panic!("Deserialization failed for {}: {}", path.display(), err));
        let estimate = sketch.estimate();
        // Relative error of the estimate, expressed as a percentage.
        let error_pct = ((estimate - expected as f64).abs() / expected as f64) * 100.;

        println!("{:<12} {:<12.0} {:<10.3}", expected, estimate, error_pct);

        // All estimates should be within 2% error
        assert!(
            error_pct < 2.,
            "Error too high for {}: {:.3}%",
            path.display(),
            error_pct
        );
    }
}