use std::hash::Hash;
use crate::common::NumStdDev;
use crate::common::ResizeFactor;
use crate::common::binomial_bounds;
use crate::hash::DEFAULT_UPDATE_SEED;
use crate::theta::hash_table::DEFAULT_LG_K;
use crate::theta::hash_table::MAX_LG_K;
use crate::theta::hash_table::MAX_THETA;
use crate::theta::hash_table::MIN_LG_K;
use crate::theta::hash_table::ThetaHashTable;
#[derive(Debug)]
pub struct ThetaSketch {
table: ThetaHashTable,
}
impl ThetaSketch {
pub fn builder() -> ThetaSketchBuilder {
ThetaSketchBuilder::default()
}
pub fn update<T: Hash>(&mut self, value: T) {
let hash = self.table.hash_and_screen(value);
if hash != 0 {
self.table.try_insert(hash);
}
}
pub fn update_f64(&mut self, value: f64) {
let canonical = canonical_double(value);
self.update(canonical);
}
pub fn update_f32(&mut self, value: f32) {
self.update_f64(value as f64);
}
pub fn estimate(&self) -> f64 {
if self.is_empty() {
return 0.0;
}
let num_retained = self.table.num_entries() as f64;
let theta = self.table.theta() as f64 / MAX_THETA as f64;
num_retained / theta
}
pub fn theta(&self) -> f64 {
self.table.theta() as f64 / MAX_THETA as f64
}
pub fn theta64(&self) -> u64 {
self.table.theta()
}
pub fn is_empty(&self) -> bool {
self.table.is_empty()
}
pub fn is_estimation_mode(&self) -> bool {
self.table.theta() < MAX_THETA
}
pub fn num_retained(&self) -> usize {
self.table.num_entries()
}
pub fn lg_k(&self) -> u8 {
self.table.lg_nom_size()
}
pub fn trim(&mut self) {
self.table.trim();
}
pub fn reset(&mut self) {
self.table.reset();
}
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.table.iter()
}
pub fn lower_bound(&self, num_std_dev: NumStdDev) -> f64 {
if !self.is_estimation_mode() {
return self.num_retained() as f64;
}
binomial_bounds::lower_bound(self.num_retained() as u64, self.theta(), num_std_dev)
.expect("theta should always be valid")
}
pub fn upper_bound(&self, num_std_dev: NumStdDev) -> f64 {
if !self.is_estimation_mode() {
return self.num_retained() as f64;
}
binomial_bounds::upper_bound(
self.num_retained() as u64,
self.theta(),
num_std_dev,
self.is_empty(),
)
.expect("theta should always be valid")
}
}
#[derive(Debug)]
pub struct ThetaSketchBuilder {
lg_k: u8,
resize_factor: ResizeFactor,
sampling_probability: f32,
seed: u64,
}
impl Default for ThetaSketchBuilder {
fn default() -> Self {
Self {
lg_k: DEFAULT_LG_K,
resize_factor: ResizeFactor::X8,
sampling_probability: 1.0,
seed: DEFAULT_UPDATE_SEED,
}
}
}
impl ThetaSketchBuilder {
pub fn lg_k(mut self, lg_k: u8) -> Self {
assert!(
(MIN_LG_K..=MAX_LG_K).contains(&lg_k),
"lg_k must be in [{}, {}], got {}",
MIN_LG_K,
MAX_LG_K,
lg_k
);
self.lg_k = lg_k;
self
}
pub fn resize_factor(mut self, factor: ResizeFactor) -> Self {
self.resize_factor = factor;
self
}
pub fn sampling_probability(mut self, probability: f32) -> Self {
assert!(
(0.0..=1.0).contains(&probability) && probability > 0.0,
"sampling_probability must be in (0.0, 1.0], got {probability}"
);
self.sampling_probability = probability;
self
}
pub fn seed(mut self, seed: u64) -> Self {
self.seed = seed;
self
}
pub fn build(self) -> ThetaSketch {
let table = ThetaHashTable::new(
self.lg_k,
self.resize_factor,
self.sampling_probability,
self.seed,
);
ThetaSketch { table }
}
}
fn canonical_double(value: f64) -> i64 {
if value.is_nan() {
0x7ff8000000000000i64
} else {
(value + 0.0).to_bits() as i64
}
}