hyperloglog-rs 0.1.56

A Rust implementation of HyperLogLog trying to be parsimonious with memory.
Documentation
r"""
Script to pre-compute the constants associated to the computation of the
cardinalities used in the HyperLogLog algorithm in the version that employes
the multiplicities of the registers. The paper that describes the algorithm is
the following [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf).

Specifically, the two formulas we need to precompute are the following.

For a multiplicity x, that can have value between 0 and the number of registers m,
which we recall are equal to m=2^P, we need to compute the following
constant:
$$\tau(x) = \frac{x}{3} - \frac{m}{3} \sum_{k=1}^{\inf} {\left(1 - {\left(1 - \frac{x}{m}\right)}^{2^{-k}}\right)}^{2} 2^{-k}$$

The second constant we need to compute is sigma, also defined for a multiplicity
x, that can have value between 0 and the number of registers m, which we recall
are equal to m=2^P, is defined as follows:

$$\sigma(x) = x + m \sum_{k=1}^{\inf} {\frac{x}{m}}^{2^k} 2^{k-1}$$

Since we cannot compute the infinite sum, we approximate it by computing the
terms up until convergence, which is when the value of the term is smaller than
the precision of the machine.

We write out the constants as two Rust files that can be readily imported in the
hyperloglog-rs crate, in the positions "src/tau_multiplicities_constants.rs" and
"src/sigma_multiplicities_constants.rs".
We compute the values for all precisions we considered, between 4 and 18.

The files will header will look like the following, with no more than 3 decimals
for each constant.

```rust
//! This file is generated by the script multiplities_constants_precomputation.py
//! and contains the constants used in the computation of the cardinalities
//! in the HyperLogLog algorithm that employs the multiplicities of the registers.
//! The paper that describes the algorithm is the following
//! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf).
//! The constants are computed for all precisions between 4 and 18.

pub const TAU: [&[f32]; 15] = [
    // precision 4
    &[
       0.0,
       0.33,
       0.654,
       0.97,
       1.278,
       1.577,
       1.864,
       2.139,
       2.399,
```
"""

import sys
from tqdm.auto import trange

def compute_tau(x, m):
    r"""
    Compute the constant tau(x) for a given multiplicity x and number of registers m.

    Parameters
    ----------
    x : int
        The multiplicity of the register.
    m : int
        The number of registers.
    """
    summation = 0
    k = 1
    x = (1 - x / m)
    while True:
        term = (1 - x**(2**(-k)))**2 * 2**(-k)
        summation += term
        if term < sys.float_info.epsilon:
            break
        k += 1
    return m / 3.0 * (1 - x - summation)

def compute_sigma(x, m):
    r"""
    Compute the constant sigma(x) for a given multiplicity x and number of registers m.

    Parameters
    ----------
    x : int
        The multiplicity of the register.
    m : int
        The number of registers.
    """
    summation = 0
    k = 1
    x = x / m
    try:
        while True:
            term = x**(2**k) * 2**(k - 1)
            summation += term
            if term < sys.float_info.epsilon:
                break
            k += 1
    except OverflowError:
        return float("inf")
    return m * (x + summation)


def write_out_tau():
    r"""
    Compute the constants for all precisions and write them to a Rust file.
    """
    with open("src/tau_multiplicities_constants.rs", "w", encoding="utf8") as file:
        file.write(
            r"""//! This file is generated by the script multiplities_constants_precomputation.py
//! and contains the constants used in the computation of the cardinalities
//! in the HyperLogLog algorithm that employs the multiplicities of the registers.
//! The paper that describes the algorithm is the following
//! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf).
//! The constants are computed for all precisions between 4 and 18.

pub const TAU: [&[f32]; 15] = [
""")
        for precision in trange(4, 19, desc="Precision", leave=False):
            file.write(f"    // precision {precision}\n")
            file.write("    &[")
            for multeplicity in range(0, 2**precision + 1):
                tau = compute_tau(multeplicity, 2**precision)

                # We round the tau so that it does not have more than
                # 3 decimals.
                tau = round(tau, 3)

                # If tau is higher than 10k, we round it so that it
                # does not have more than 2 decimals.
                if tau > 10000:
                    tau = round(tau, 2)

                file.write(
                    f"\n       {tau},")
            file.write("\n    ],\n")
        file.write("];\n")

def write_out_sigma():
    r"""
    Compute the constants for all precisions and write them to a Rust file.
    """
    with open("src/sigma_multiplicities_constants.rs", "w", encoding="utf8") as file:
        file.write(
            r"""//! This file is generated by the script multiplities_constants_precomputation.py
//! and contains the constants used in the computation of the cardinalities
//! in the HyperLogLog algorithm that employs the multiplicities of the registers.
//! The paper that describes the algorithm is the following
//! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf).
//! The constants are computed for all precisions between 4 and 18.

pub const SIGMA: [&[f32]; 15] = [
""")
        for precision in trange(4, 19, desc="Precision", leave=False):
            file.write(f"    // precision {precision}\n")
            file.write("    &[")
            for multeplicity in range(0, 2**precision + 1):
                sigma = compute_sigma(multeplicity, 2**precision)

                # We round the sigma so that it does not have more than
                # 3 decimals.
                sigma = round(sigma, 6)

                if sigma > 10:
                    sigma = round(sigma, 5)
                
                if sigma > 100:
                    sigma = round(sigma, 4)

                if sigma > 1000:
                    sigma = round(sigma, 3)

                # If sigma is higher than 10k, we round it so that it
                # does not have more than 2 decimals.
                if sigma > 10000:
                    sigma = round(sigma, 2)

                if sigma == 0.434294 or sigma == 0.4342:
                    sigma = "core::f32::consts::LOG10_E"

                if sigma == 0.785398:
                    sigma = "core::f32::consts::FRAC_PI_4"

                if sigma == 0.31831 or sigma == 0.318:
                    sigma = "core::f32::consts::FRAC_1_PI"

                if sigma == float("inf"):
                    sigma = "core::f32::INFINITY"

                file.write(
                    f"\n       {sigma},")
            file.write("\n    ],\n")
        file.write("];\n")


if __name__ == "__main__":
    write_out_tau()
    write_out_sigma()