use std::ops::Bound::{Included,Excluded};
use std::fmt::{Formatter,Debug,Result};
use float_extras::f64::frexp;
use skiplist::SkipMap;
/// One histogram bucket: a representative sample value paired with the
/// total weight accumulated in that bucket.
#[derive(Copy, Clone, Debug)]
struct HistogramBin {
    pub sample: f64,
    pub weight: f64,
}

impl HistogramBin {
    /// Build a bucket from a representative sample and its accumulated weight.
    /// (No validation here; `set` validates on every subsequent overwrite.)
    pub fn new(sample: f64, weight: f64) -> Self {
        HistogramBin { sample, weight }
    }

    /// Panic unless the weight is a finite, non-negative number.
    fn validate(&self) {
        let weight_ok = self.weight.is_finite() && self.weight >= 0.0;
        if !weight_ok {
            panic!("HistogramBin weight must be a non-negative, finite number.");
        }
    }

    /// Overwrite this bucket with the values of `copy_from`, then validate.
    pub fn set(&mut self, copy_from: Self) {
        *self = copy_from;
        self.validate();
    }
}
/// Broad classification of an `f64` sample, used to route it to the
/// appropriate set of histogram accumulators.
enum SampleCategory {
    Negative,
    Zero,
    Positive,
    NegativeInfinity,
    PositiveInfinity,
    NotANumber
}

impl SampleCategory {
    /// Classify a sample as finite positive/negative/zero, one of the two
    /// infinities, or NaN. The categories are mutually exclusive, so the
    /// test order here does not affect the result.
    pub fn categorize(sample: f64) -> Self {
        if sample.is_nan() {
            SampleCategory::NotANumber
        } else if sample == f64::INFINITY {
            SampleCategory::PositiveInfinity
        } else if sample == f64::NEG_INFINITY {
            SampleCategory::NegativeInfinity
        } else if sample == 0.0 {
            // Covers both +0.0 and -0.0.
            SampleCategory::Zero
        } else if sample > 0.0 {
            SampleCategory::Positive
        } else {
            SampleCategory::Negative
        }
    }
}
/// Approximate quantile/statistics estimator that accumulates samples into
/// logarithmically spaced histogram bins (a fixed number of fine bins per
/// doubling of sample magnitude), plus scalar accumulators for the special
/// categories (zero, NaN, the infinities).
pub struct Quantogram {
/// Width ratio between successive fine bins (e.g. 1.02).
growth : f64,
/// Number of fine bins per power of two.
bins_per_doubling : usize,
/// Memoized powers growth^i for i in 0..=bins_per_doubling.
growth_bin_power_memo : Vec<f64>,
/// Net number of samples (removals with negative weight decrement this).
total_count : usize,
/// Total weight of all finite samples.
total_weight : f64,
/// Sum of weight * sample over finite samples; used for the mean.
weighted_sum : f64,
/// Accumulated weight of NaN samples.
nan_weight : f64,
/// Accumulated weight of +infinity samples. (Field name preserves a
/// historical misspelling of "infinity".)
plus_ininity_weight : f64,
/// Accumulated weight of -infinity samples. (Same misspelling preserved.)
minus_ininity_weight : f64,
/// Accumulated weight of finite negative samples.
negative_weight : f64,
/// Accumulated weight of zero-valued samples.
zero_weight : f64,
/// Accumulated weight of finite positive samples.
positive_weight : f64,
/// Largest finite sample seen; NaN until the first finite sample arrives.
maximum : f64,
/// Smallest finite sample seen; NaN until the first finite sample arrives.
minimum : f64,
/// True while every finite sample added so far has had no fractional part.
only_integers : bool,
/// Samples in (negative_underflow, positive_underflow) are clamped to zero;
/// samples beyond ±overflow are clamped to the matching infinity.
negative_overflow : f64,
negative_underflow : f64,
positive_overflow : f64,
positive_underflow : f64,
/// One coarse bin per power of two, positive samples.
positive_coarse_bins : SkipMap<isize, HistogramBin>,
/// One coarse bin per power of two, negative samples.
negative_coarse_bins : SkipMap<isize, HistogramBin>,
/// Fine bins keyed by a signed combination of power-of-two and sub-bin.
fine_bins : SkipMap<isize, HistogramBin>,
/// ln(growth), cached for the logarithm approximation.
log_of_growth : f64,
/// Cache of the current modal bin(s) so mode() avoids a full scan.
mode_cache : ModeCache
}
impl Quantogram {
/// Create a Quantogram with the default configuration: growth factor 1.02,
/// 35 bins per doubling, and the full magnitude range 2^-126 ..= 2^127
/// before under/overflow clamping applies.
pub fn new() -> Self {
let bins = 35;
let growth = 1.02;
let mut q = Quantogram {
growth : growth,
bins_per_doubling : bins,
growth_bin_power_memo : Vec::new(),
total_count : 0,
total_weight : 0.0,
weighted_sum : 0.0,
nan_weight : 0.0,
plus_ininity_weight : 0.0,
minus_ininity_weight : 0.0,
negative_weight : 0.0,
zero_weight : 0.0,
positive_weight : 0.0,
maximum : f64::NAN,
minimum : f64::NAN,
only_integers : true,
negative_underflow : -(2.0_f64.powi(-126)),
negative_overflow : -(2.0_f64.powi(127)),
positive_underflow : 2.0_f64.powi(-126),
positive_overflow : 2.0_f64.powi(127),
positive_coarse_bins : SkipMap::with_capacity(254),
negative_coarse_bins : SkipMap::with_capacity(254),
fine_bins : SkipMap::with_capacity(17641),
log_of_growth : growth.ln(),
mode_cache : ModeCache::new()
};
q.memoize_growth_bin_power();
q
}
/// Create a Quantogram with an explicit growth factor, bin count per
/// doubling, and magnitude range [2^smallest_power, 2^largest_power].
///
/// # Panics
/// If either power lies outside [-126, 127] or they are not ascending.
pub fn with_configuration(growth: f64, bins: usize, smallest_power: isize, largest_power: isize) -> Self {
let valid_power = -126..=127;
if !valid_power.contains(&smallest_power) || !valid_power.contains(&largest_power) {
panic!("Power of two constraints must be in range [-126,127]");
}
if smallest_power >= largest_power {
panic!("Power of two constraints not given in ascending order");
}
let mut q = Quantogram {
growth : growth,
bins_per_doubling : bins,
growth_bin_power_memo : Vec::new(),
total_count : 0,
total_weight : 0.0,
weighted_sum : 0.0,
nan_weight : 0.0,
plus_ininity_weight : 0.0,
minus_ininity_weight : 0.0,
negative_weight : 0.0,
zero_weight : 0.0,
positive_weight : 0.0,
maximum : f64::NAN,
minimum : f64::NAN,
only_integers : true,
negative_underflow : -(2.0_f64.powi(smallest_power as i32)),
negative_overflow : -(2.0_f64.powi(largest_power as i32)),
positive_underflow : 2.0_f64.powi(smallest_power as i32),
positive_overflow : 2.0_f64.powi(largest_power as i32),
positive_coarse_bins : SkipMap::with_capacity(254),
negative_coarse_bins : SkipMap::with_capacity(254),
// NOTE(review): capacity assumes 35 fine bins per doubling regardless of
// the `bins` argument — confirm this is only a capacity hint, not a limit.
fine_bins : SkipMap::with_capacity(((largest_power - smallest_power + 1)*35 + 1) as usize),
log_of_growth : growth.ln(),
mode_cache : ModeCache::new()
};
q.memoize_growth_bin_power();
q
}
/// Add one sample with weight 1.
pub fn add(&mut self, sample: f64) {
self.add_weighted(sample, 1.0);
}
/// Remove one previously added sample by adding it with weight -1.
pub fn remove(&mut self, sample: f64) {
self.add_weighted(sample, -1.0);
}
/// Add a sample with an arbitrary weight (negative weight removes).
/// Returns the updated weight of the bin the sample landed in (or the
/// updated category weight for NaN/±infinity samples).
pub fn add_weighted(&mut self, sample: f64, weight: f64) -> f64 {
// Clamp tiny values to zero and huge values to ±infinity first.
let bounded_sample = self.over_or_underflow(sample);
let adjusted_fine_weight;
self.adjust_aggregates(bounded_sample, weight);
match SampleCategory::categorize(bounded_sample) {
SampleCategory::Positive => {
self.positive_weight += weight;
// Update the fine bin, the mode cache, and the per-power coarse bin.
let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
Self::increment_key_weight(&mut self.positive_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
},
SampleCategory::Zero => {
// Zero has its own fine bin (key 0) but no coarse bin.
self.zero_weight += weight;
let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
},
SampleCategory::Negative => {
self.negative_weight += weight;
let (fine_key, fine_low, fine_midpoint, fine_high) = self.get_fine_key_with_midpoint(bounded_sample, self.growth, self.bins_per_doubling).unwrap();
adjusted_fine_weight = Self::increment_key_weight(&mut self.fine_bins, fine_key, fine_midpoint, sample, weight);
self.adjust_mode(fine_key, fine_low, fine_midpoint, fine_high, adjusted_fine_weight);
let (coarse_key, coarse_midpoint) = Self::get_coarse_key_with_midpoint(bounded_sample).unwrap();
Self::increment_key_weight(&mut self.negative_coarse_bins, coarse_key, coarse_midpoint, sample, weight);
},
SampleCategory::NotANumber => {
self.nan_weight += weight;
adjusted_fine_weight = self.nan_weight;
},
SampleCategory::PositiveInfinity => {
self.plus_ininity_weight += weight;
adjusted_fine_weight = self.plus_ininity_weight;
},
SampleCategory::NegativeInfinity => {
self.minus_ininity_weight += weight;
adjusted_fine_weight = self.minus_ininity_weight;
}
}
adjusted_fine_weight
}
/// Add every sample from the iterator with weight 1.
pub fn add_unweighted_samples<'a>(&mut self, samples: impl Iterator<Item = &'a f64>) {
for sample in samples {
self.add(*sample);
}
}
/// Weighted mean of the finite samples, or None if no weight accumulated.
pub fn mean(&self) -> Option<f64> {
if self.total_weight > 0.0 { Some(self.weighted_sum / self.total_weight) }
else { None }
}
/// Smallest finite sample seen, or None if none yet.
pub fn min(&self) -> Option<f64> {
if self.minimum.is_finite() { Some(self.minimum) }
else { None }
}
/// Largest finite sample seen, or None if none yet.
pub fn max(&self) -> Option<f64> {
if self.maximum.is_finite() { Some(self.maximum) }
else { None }
}
/// Net number of samples added (removals decrement).
pub fn count(&self) -> usize {
self.total_count
}
/// Fraction of the added weight contributed by finite samples;
/// NaN when nothing has been added.
pub fn finite(&self) -> f64 {
if self.total_count == 0 {
f64::NAN
}
else {
let numerator = self.total_weight;
let denominator = self.added_weight();
numerator / denominator
}
}
/// Fraction of the added weight contributed by zero-valued samples;
/// NaN when nothing has been added.
pub fn zero(&self) -> f64 {
if self.total_count == 0 {
f64::NAN
}
else {
let numerator = self.zero_weight;
let denominator = self.added_weight();
numerator / denominator
}
}
/// Fraction of the added weight contributed by NaN samples;
/// NaN when nothing has been added.
pub fn nan(&self) -> f64 {
if self.total_count == 0 {
f64::NAN
}
else {
let numerator = self.nan_weight;
let denominator = self.added_weight();
numerator / denominator
}
}
/// The 0.5 quantile.
pub fn median(&self) -> Option<f64> {
self.quantile(0.5)
}
/// Estimated mode(s); multiple values when several bins tie for top weight.
/// Estimates are rounded when all samples so far were integers.
pub fn mode(&self) -> Vec<f64> {
let unrounded_mode = self.mode_cache.mode(&self.fine_bins);
if self.only_integers {
let rounded_mode = unrounded_mode
.into_iter()
.map(|x| x.round())
.collect();
rounded_mode
}
else {
unrounded_mode
}
}
/// Approximate the phi-quantile (phi in [0,1]) by walking the coarse bins
/// first, then the fine bins inside the identified power-of-two range.
/// Returns None for out-of-range phi or when no weight has accumulated.
pub fn quantile(&self, phi: f64) -> Option<f64> {
if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
None
}
else if phi == 0.0 {
self.min()
}
else if phi == 1.0 {
self.max()
}
else {
let mut target_cume_weight = self.total_weight * phi;
// Negative samples hold the lowest quantiles.
if target_cume_weight <= self.negative_weight {
let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.negative_coarse_bins).unwrap();
// NOTE(review): this index arithmetic mirrors the key layout built in
// get_fine_key_with_midpoint (negated for negative samples) — keep in sync.
let high_fine_index = ((-coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * -1 + 1; let low_fine_index = high_fine_index - (self.bins_per_doubling as isize) + 1; let (_fine_index, _remainder2, sample_at_quantile) = Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins).unwrap();
return Some(self.conditional_round(sample_at_quantile));
}
target_cume_weight -= self.negative_weight;
// Then the zeroes, then the positives.
if target_cume_weight <= self.zero_weight {
return Some(0.0);
}
else {
target_cume_weight -= self.zero_weight;
}
let (coarse_index, remainder) = Self::find_containing_bucket(target_cume_weight, &self.positive_coarse_bins).unwrap();
let low_fine_index = ((coarse_index + 126) * (self.bins_per_doubling as isize) + 0 + 1) * 1;
let high_fine_index = low_fine_index + (self.bins_per_doubling as isize); let (_fine_index, _remainder2, sample_at_quantile)
= Self::find_containing_bucket_in_range(low_fine_index, high_fine_index, remainder, &self.fine_bins)
.unwrap();
return Some(self.conditional_round(sample_at_quantile));
}
}
/// Like `quantile`, but detects when phi falls in a sparse gap of the
/// distribution: if the quantiles at phi ± epsilon differ lopsidedly by
/// more than `threshold_ratio`, average across the gap; if both sides are
/// lopsided in neither direction, return the plain estimate's neighbors'
/// blend or None.
pub fn fussy_quantile(&self, phi: f64, threshold_ratio: f64) -> Option<f64> {
if phi < 0.0 || phi > 1.0 || self.total_weight <= 0.0 {
None
}
else if phi == 0.0 {
self.min()
}
else if phi == 1.0 {
self.max()
}
else {
// Half of one sample's share of the cumulative distribution.
let epsilon = 1.0 / (2.0 * self.total_count as f64);
let q_middle = self.quantile(phi);
if phi <= 1.5 * epsilon || phi >= 1.0 - 1.5 * epsilon || q_middle.is_none() {
q_middle
}
else {
let q_low = self.quantile(phi - epsilon).unwrap();
let q_high = self.quantile(phi + epsilon).unwrap();
let q_median = q_middle.unwrap();
let low_difference = q_median - q_low;
let high_difference = q_high - q_median;
if low_difference >= high_difference * threshold_ratio {
Some((q_low + q_median)/2.0)
}
else if high_difference >= low_difference * threshold_ratio {
Some((q_high + q_median)/2.0)
}
else {
None
}
}
}
}
/// Inverse quantile: for a value, return the (lower, upper) bounds of the
/// phi range it occupies, or None when no finite samples exist.
pub fn quantile_at(&self, value: f64) -> Option<(f64,f64)> {
let min_opt = self.min();
if min_opt.is_none() {
return None;
}
let min = min_opt.unwrap();
let max = self.max().unwrap();
match self.count() {
0 => None,
1 => {
if value < min { Some((0.0, 0.0)) }
else if value > min { Some((1.0, 1.0)) }
else { Some((0.0, 1.0)) }
},
_ if value < min => Some((0.0, 0.0)),
_ if value > max => Some((1.0, 1.0)),
_ if min == max => Some((0.0, 1.0)),
_ if value == min => Some((0.0, 0.0)),
_ if value == max => Some((1.0, 1.0)),
_ if value == 0.0 => {
// Zero sits between all negative weight and all positive weight.
let neg = self.negative_weight;
let sum = self.total_weight;
let zero = self.zero_weight;
Some((neg / sum, (neg + zero)/sum))
},
_ => {
let (end_key, _fine_low, _fine_midpoint, _fine_high)
= self.get_fine_key_with_midpoint(value, self.growth, self.bins_per_doubling).unwrap();
// Positive values start accumulating after all negative and zero weight.
let (start_key, start_weight) =
if value >= 0.0 { (1_isize, self.negative_weight + self.zero_weight) }
else { (isize::MIN , 0.0) };
let mut cume_weight = start_weight;
let sum = self.total_weight;
for (key, bucket) in self.fine_bins.range(Included(&start_key), Included(&isize::MAX)) {
let weight = bucket.weight;
if *key == end_key {
return Some((cume_weight / sum, (cume_weight + weight)/sum));
}
else if *key > end_key {
// The value's bin is empty; its phi range is a single point.
return Some((cume_weight / sum, cume_weight/sum));
}
cume_weight += weight;
}
None
}
}
}
/// Rough measure of memory use in bucket counts.
/// NOTE(review): the constant 7 appears to account for the scalar
/// category accumulators — confirm the intended accounting.
pub fn size(&self) -> usize {
7 + self.positive_coarse_bins.len()
+ self.negative_coarse_bins.len()
+ self.fine_bins.len()
}
/// Precompute growth^i for i in 0..=bins_per_doubling, used when mapping
/// a fine bin index back to a bin boundary.
fn memoize_growth_bin_power(&mut self) {
for bin in 0..=self.bins_per_doubling {
self.growth_bin_power_memo.push(self.growth.powi(bin as i32));
}
}
/// Approximate floor(ln(x) / ln(growth)) using a rational (Padé-style)
/// approximation of ln(x) to avoid a true logarithm per sample.
/// NOTE(review): only ever called with the mantissa remainder in [1,2) —
/// confirm the coefficients are tuned for that interval.
fn pade_approximate_log_floor(&self, x: f64) -> isize {
let x2 = x * x;
let x3 = x2 * x;
let numerator = (11.0/27.0) * x3 + x2 - x - (11.0/27.0);
let denominator = self.log_of_growth * (x3/9.0 + x2 + x + (1.0/9.0));
let ln_x = numerator / denominator;
ln_x.floor() as isize
}
/// Walk the map in key order until the cumulative weight reaches
/// `cume_weight`; return that key and the weight still to be consumed
/// inside its bucket. None if the map's total weight is insufficient.
fn find_containing_bucket(cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64)> {
let mut remaining_weight = cume_weight;
for (k, bucket) in map.iter() {
let weight = bucket.weight;
if weight >= remaining_weight {
return Some((*k, remaining_weight));
}
remaining_weight -= weight;
}
None
}
/// Same as `find_containing_bucket` but restricted to keys in
/// [start_key, end_key); also returns the matched bucket's sample.
fn find_containing_bucket_in_range(start_key: isize, end_key: isize, cume_weight: f64, map: &SkipMap<isize, HistogramBin>) -> Option<(isize,f64,f64)> {
let mut remaining_weight = cume_weight;
for (k, bucket) in map.range(Included(&start_key), Excluded(&end_key)) {
let weight = bucket.weight;
if weight >= remaining_weight {
return Some((*k, remaining_weight, bucket.sample));
}
remaining_weight -= weight;
}
None
}
/// Maintain the scalar aggregates (count, weight, sum, min/max,
/// integer-only flag) for one incoming (sample, weight) pair.
fn adjust_aggregates(&mut self, sample: f64, weight: f64) {
self.adjust_minimum(sample);
self.adjust_maximum(sample);
let weight_is_finite = weight.is_finite();
let sample_is_finite = sample.is_finite();
// A finite negative weight is treated as a removal.
if weight < 0.0 && weight_is_finite {
self.total_count -= 1;
}
else {
self.total_count += 1;
}
if sample_is_finite && weight_is_finite {
self.total_weight += weight;
self.weighted_sum += weight * sample;
}
// A single fractional sample permanently disables integer rounding.
if sample_is_finite && sample.fract() != 0.0 {
self.only_integers = false;
}
}
/// Clamp magnitudes below the underflow threshold to zero and above the
/// overflow threshold to the matching infinity.
fn over_or_underflow(&self, sample: f64) -> f64 {
if sample > self.negative_underflow && sample < self.positive_underflow {
0.0
}
else if sample > self.positive_overflow {
f64::INFINITY
}
else if sample < self.negative_overflow {
f64::NEG_INFINITY
}
else {
sample
}
}
/// Lower the running minimum if this finite sample is smaller.
fn adjust_minimum(&mut self, sample: f64) {
if self.minimum.is_nan() {
if sample.is_finite() {
self.minimum = sample;
}
}
else if sample < self.minimum && sample.is_finite() {
self.minimum = sample;
}
}
/// Raise the running maximum if this finite sample is larger.
fn adjust_maximum(&mut self, sample: f64) {
if self.maximum.is_nan() {
if sample.is_finite() {
self.maximum = sample;
}
}
else if sample > self.maximum && sample.is_finite() {
self.maximum = sample;
}
}
/// Offer the updated bin to the mode cache.
/// NOTE(review): bins whose weight is exactly 1.0 are excluded — this
/// reads as "a lone sample is never a mode", but confirm it is intentional.
fn adjust_mode(
&mut self,
bin_key: isize,
bin_low_value: f64,
bin_midpoint: f64,
bin_high_value: f64,
bin_weight: f64) {
if bin_midpoint.is_finite() && bin_weight > 0.0 && bin_weight != 1.0 {
self.mode_cache.update(bin_key, bin_low_value, bin_high_value, bin_weight);
}
}
/// Add `added_weight` to the bucket at `key`, creating it if absent.
/// A new bucket records the accurate sample; an existing bucket's sample
/// is replaced by the bin midpoint. Returns the bucket's new weight.
fn increment_key_weight(map: &mut SkipMap<isize, HistogramBin>, key: isize, bucket_sample: f64, accurate_sample: f64, added_weight: f64) -> f64 {
match map.get_mut(&key) {
Some(bucket) => {
let new_bucket = HistogramBin::new(bucket_sample, bucket.weight + added_weight);
bucket.set(new_bucket);
new_bucket.weight
},
None => {
map.insert(key, HistogramBin::new(accurate_sample, added_weight));
added_weight
}
}
}
/// Round a result to the nearest integer when every sample so far has
/// been an integer.
fn conditional_round(&self, sample: f64) -> f64 {
if self.only_integers && sample.is_finite() {
sample.round()
}
else {
sample
}
}
/// Total weight including the NaN and infinity categories.
fn added_weight(&self) -> f64 {
self.total_weight + self.nan_weight + self.plus_ininity_weight + self.minus_ininity_weight
}
/// Largest power of two not exceeding |sample|, via frexp; None for
/// zero and non-finite values.
pub fn power_of_two(sample: f64) -> Option<isize> {
if sample.is_finite() && sample != 0.0 {
let (_mantissa, exponent) = frexp(sample);
Some(exponent - 1)
}
else {
None
}
}
/// Decompose a finite nonzero sample into (sign, power of two, remainder)
/// where remainder = |sample| / 2^power lies in [1, 2).
fn sign_power_remainder(sample: f64) -> Option<(isize, isize, f64)> {
if sample.is_finite() && sample != 0.0 {
// frexp yields mantissa in [0.5, 1); doubling shifts it into [1, 2).
let (mantissa, exponent) = frexp(sample);
if sample > 0.0 {
Some((1, exponent - 1, mantissa * 2.0))
}
else {
Some((-1, exponent - 1, mantissa.abs() * 2.0))
}
}
else {
None
}
}
/// 2^power computed by integer shifts; NaN outside [-126, 127].
/// NOTE(review): the -63..=-1 arm shifts a usize — on a 32-bit target a
/// shift of 32..=63 would overflow; confirm 64-bit-only is acceptable.
#[inline(always)]
fn two_power(power: isize) -> f64 {
match power {
0..=63 => (1_usize << power) as f64,
64..=127 => (1_u128 << power) as f64,
-63..=-1 => 1.0 / ((1_usize << -power) as f64),
-126..=-64 => 1.0 / ((1_u128 << -power) as f64),
_ => f64::NAN
}
}
/// Key and midpoint of the coarse (one-per-power-of-two) bucket holding
/// `sample`; None for zero and non-finite values.
fn get_coarse_key_with_midpoint(sample: f64) -> Option<(isize,f64)> {
match Self::sign_power_remainder(sample) {
Some((sign, power, _remainder)) => {
let low = Self::two_power(power); let high = low * 2.0;
let bucket_middle = (sign as f64) * (low + high) / 2.0;
Some((sign * power, bucket_middle))
},
_ => None
}
}
/// Key, low bound, midpoint, and high bound of the fine bucket holding
/// `sample`. Zero maps to key 0; non-finite values yield None.
/// Keys for negative samples are the negation of the positive layout,
/// so key order matches numeric order across the whole map.
fn get_fine_key_with_midpoint(&self, sample: f64, growth: f64, bins_per_doubling: usize) -> Option<(isize,f64,f64,f64)> {
match Self::sign_power_remainder(sample) {
Some((sign, power, remainder)) => {
// Sub-bin within the power-of-two range, via the fast log.
let bin = self.pade_approximate_log_floor(remainder);
let key = ((power + 126) * (bins_per_doubling as isize) + bin + 1) * sign;
let two_power = Self::two_power(power); let bucket_low = two_power * self.growth_bin_power_memo[bin as usize]; let bucket_high = bucket_low * growth;
let bucket_middle = (sign as f64) * (bucket_low + bucket_high) / 2.0;
Some((key, bucket_low, bucket_middle, bucket_high))
},
_ => {
if sample == 0.0 {
Some((0, 0.0, 0.0, 0.0))
}
else {
None
}
}
}
}
}
impl Debug for Quantogram {
/// Multi-line diagnostic dump: every aggregate plus the full contents of
/// the coarse and fine bin maps, formatted as "(\n key->sample:weight ... \n )".
fn fmt(&self, f: &mut Formatter) -> Result {
let mut fine = "(".to_owned();
for (k, bucket) in self.fine_bins.iter() {
fine = fine + &format!("\n {}->{:.4}:{}", k, bucket.sample, bucket.weight);
}
fine = fine + "\n )";
let mut coarse_plus = "(".to_owned();
for (k, bucket) in self.positive_coarse_bins.iter() {
coarse_plus = coarse_plus + &format!("\n {}->{:.4}:{}", k, bucket.sample, bucket.weight);
}
coarse_plus = coarse_plus + "\n )";
let mut coarse_minus = "(".to_owned();
for (k, bucket) in self.negative_coarse_bins.iter() {
coarse_minus = coarse_minus + &format!("\n {}->{:.4}:{}", k, bucket.sample, bucket.weight);
}
coarse_minus = coarse_minus + "\n )";
// The under/overflow thresholds are reported as powers of two (log2).
write!(f, "Quantogram.
growth = {}, bins per doubling = {}
total count = {}, weight = {}, sum = {}
NAN = {}, -Inf = {}, +Inf = {}, Integers? = {}
- = {}, 0 = {}, + = {}
min = {}, mean = {:?}, max = {}
under/overflow = {}/{}
coarse bins =
Minus {}
Plus {}
fine bins = {}
",
self.growth, self.bins_per_doubling,
self.total_count, self.total_weight, self.weighted_sum,
self.nan_weight, self.minus_ininity_weight, self.plus_ininity_weight, self.only_integers,
self.negative_weight, self.zero_weight, self.positive_weight,
self.minimum, self.mean(), self.maximum,
self.positive_underflow.log2(), self.positive_overflow.log2(),
coarse_minus,
coarse_plus,
fine
)
}
}
/// Fluent builder for `Quantogram`: the error, growth factor, and bin count
/// are interdependent, so setting any one of them recomputes the other two.
#[derive(Copy, Clone, Debug)]
pub struct QuantogramBuilder {
// Achievable relative error, derived from the growth factor.
error : f64,
// Number of fine bins per power of two.
bins_per_doubling : usize,
// Width ratio between successive fine bins.
growth : f64,
// Smallest permitted power of two (underflow threshold).
smallest_power: isize,
// Largest permitted power of two (overflow threshold).
largest_power: isize
}
impl QuantogramBuilder {
    /// Builder initialized to the library defaults: ~1% error,
    /// 35 bins per doubling, growth 1.02, and the full 2^-126..2^127 range.
    pub fn new() -> Self {
        QuantogramBuilder {
            error: 0.01,
            bins_per_doubling: 35,
            growth: 1.02,
            smallest_power: -126,
            largest_power: 127,
        }
    }

    /// Consume the builder and produce the configured Quantogram.
    pub fn build(self) -> Quantogram {
        Quantogram::with_configuration(
            self.growth,
            self.bins_per_doubling,
            self.smallest_power,
            self.largest_power,
        )
    }

    /// Record the smallest permitted power of two, clamped to [-126, 127].
    pub fn with_smallest_power(mut self, power: isize) -> Self {
        self.smallest_power = power.clamp(-126, 127);
        self
    }

    /// Record the largest permitted power of two, clamped to [-125, 127].
    pub fn with_largest_power(mut self, power: isize) -> Self {
        self.largest_power = power.clamp(-125, 127);
        self
    }

    /// Configure from a target relative error (clamped to [0.00035, 0.17]).
    pub fn with_error(mut self, error: f64) -> Self {
        self.set_from_error(error);
        self
    }

    /// Configure from a target growth factor (clamped to [1.00069, 1.4143]).
    pub fn with_growth(mut self, growth: f64) -> Self {
        self.set_from_growth(growth);
        self
    }

    /// Configure from an explicit bin count per doubling (clamped to [2, 1000]).
    pub fn with_bins_per_doubling(mut self, bins: usize) -> Self {
        self.set_from_bins(bins);
        self
    }

    /// Derive bins-per-doubling from the requested error, then recompute
    /// the actual growth factor and achievable error from that bin count.
    fn set_from_error(&mut self, error: f64) {
        let adjusted_error = error.clamp(0.00035, 0.17);
        let growth = (1.0 + adjusted_error) / (1.0 - adjusted_error);
        self.bins_per_doubling = (1.0 / growth.log2()).ceil() as usize;
        self.recompute_growth_and_error();
    }

    /// Derive bins-per-doubling from the requested growth factor, then
    /// recompute the actual growth factor and achievable error.
    fn set_from_growth(&mut self, growth: f64) {
        let adjusted_growth = growth.clamp(1.00069, 1.4143);
        self.bins_per_doubling = (1.0 / adjusted_growth.log2()).ceil() as usize;
        self.recompute_growth_and_error();
    }

    /// Record the (clamped) bin count and recompute growth and error from it.
    fn set_from_bins(&mut self, bins: usize) {
        self.bins_per_doubling = bins.clamp(2, 1000);
        self.recompute_growth_and_error();
    }

    /// Growth is the bins_per_doubling-th root of two; the achievable error
    /// follows as (growth - 1) / (growth + 1).
    fn recompute_growth_and_error(&mut self) {
        self.growth = 2.0_f64.powf(1.0 / (self.bins_per_doubling as f64));
        self.error = (self.growth - 1.0) / (self.growth + 1.0);
    }
}
/// Snapshot of one fine bin — key, value interval, and weight — retained by
/// the `ModeCache` as a modal candidate.
#[derive(Copy, Clone, Debug)]
struct BinInfo {
pub bin_key : isize,
pub bin_low_value : f64,
pub bin_high_value : f64,
pub bin_weight : f64
}
impl BinInfo {
    /// Width of the bin's value interval.
    pub fn bin_width(&self) -> f64 {
        self.bin_high_value - self.bin_low_value
    }

    /// Weight stored in `map` under `key`, or zero when the bin is absent.
    pub fn get_key_weight(map: &SkipMap<isize, HistogramBin>, key: isize) -> f64 {
        match map.get(&key) {
            Some(bucket) => bucket.weight,
            None => 0.0
        }
    }

    /// Estimate the mode within this bin using the standard grouped-data
    /// interpolation formula
    ///     mode = L + h * (f1 - f0) / (2*f1 - f0 - f2)
    /// where f1 is this bin's weight, f0/f2 are the neighboring bins'
    /// weights, L is the bin's low bound and h its width.
    ///
    /// Fix: when the denominator is zero (a plateau where the neighbors
    /// tie with this bin, e.g. f0 == f1 == f2) the formula previously
    /// divided by zero and produced NaN or infinity; fall back to the bin
    /// midpoint in that case.
    pub fn mode_of_grouped_data(&self, bins: &SkipMap<isize, HistogramBin>) -> f64 {
        let f1 = self.bin_weight;
        let f0 = Self::get_key_weight(bins, self.bin_key - 1);
        let f2 = Self::get_key_weight(bins, self.bin_key + 1);
        let h = self.bin_width();
        let l = self.bin_low_value;
        let denominator = 2.0 * f1 - f0 - f2;
        if denominator == 0.0 {
            // Interpolation undefined; the bin midpoint is the best estimate.
            l + h / 2.0
        } else {
            l + h * (f1 - f0) / denominator
        }
    }
}
/// Running record of the heaviest fine bin(s) seen so far, so `mode()` can
/// avoid scanning the whole histogram.
struct ModeCache {
// All bins currently tied for the top weight.
modal_classes : Vec<BinInfo>,
// The top weight itself.
weight : f64
}
impl ModeCache {
    /// Start with no modal candidates and zero weight.
    pub fn new() -> Self {
        ModeCache {
            modal_classes: Vec::new(),
            weight: 0.0,
        }
    }

    /// Offer a bin as a modal candidate. A strictly heavier bin replaces the
    /// current candidates; an equally heavy bin joins them; a lighter bin is
    /// ignored. Returns true when the cache changed.
    pub fn update(&mut self, bin_key: isize,
        bin_low_value: f64,
        bin_high_value: f64,
        bin_weight: f64) -> bool {
        let candidate = BinInfo {
            bin_key,
            bin_low_value,
            bin_high_value,
            bin_weight,
        };
        match self.weight.partial_cmp(&bin_weight) {
            Some(std::cmp::Ordering::Less) => {
                self.modal_classes = vec![candidate];
                self.weight = bin_weight;
                true
            }
            Some(std::cmp::Ordering::Equal) => {
                self.modal_classes.push(candidate);
                true
            }
            // Greater, or an incomparable (NaN) weight: no change.
            _ => false,
        }
    }

    /// Interpolated mode estimate for every bin tied for the top weight.
    pub fn mode(&self, bins: &SkipMap<isize, HistogramBin>) -> Vec<f64> {
        self.modal_classes
            .iter()
            .map(|bin| bin.mode_of_grouped_data(bins))
            .collect()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::{Instant};
// Fixture with a deliberate gap between 5 and 95, for quantile tests.
fn gapped_quantogram() -> Quantogram {
let mut q = Quantogram::new();
let data = vec![1.0,2.0,3.0,4.0,5.0,95.0,96.0,97.0,98.0,99.0];
q.add_unweighted_samples(data.iter());
q
}
// Deterministic quasi-random data in [bottom, top) driven by the golden
// ratio, so tests are repeatable without a RNG dependency.
fn randomish(bottom: f64, top:f64, count: usize) -> Vec<f64> {
let mut data : Vec<f64> = Vec::new();
let range = top - bottom;
let phi: f64 = (1.0 + 5.0_f64.sqrt()) / 2.0;
for i in 0..count {
let pseudo_rand = ((i as f64) * phi) % 1.0_f64;
let sample = bottom + pseudo_rand * pseudo_rand * range;
data.push(sample);
}
data
}
// Check the Quantogram median against a sorted-vector median.
// NOTE(review): despite its name, abs_rel_error never takes .abs(), so a
// negative error always passes — confirm whether that is intended.
fn assert_median(data: &mut Vec<f64>, accuracy: f64) {
let mut q = Quantogram::new();
q.add_unweighted_samples(data.iter());
let actual_median = q.median().unwrap();
data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let expected_median = data[data.len() / 2]; let abs_rel_error = (expected_median - actual_median) / actual_median;
assert!(abs_rel_error <= accuracy);
}
// Time one bulk-load-plus-median against a naive sort-based median.
fn profile_median(data: &mut Vec<f64>) -> (u128,u128) {
let t1 = Instant::now();
let mut q = Quantogram::new();
q.add_unweighted_samples(data.iter());
let _actual_median = q.median().unwrap();
let quantogram_time = t1.elapsed().as_micros();
let t2 = Instant::now();
let mut data_copy = Vec::new();
for sample in data {
data_copy.push(*sample);
}
data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
let _expected_median = data_copy[data_copy.len() / 2]; let naive_time = t2.elapsed().as_micros();
(quantogram_time, naive_time)
}
// Time a median after EVERY insertion (the naive version re-sorts per
// insertion, so its cost grows quadratically).
fn profile_many_medians(data: &mut Vec<f64>) -> (u128,u128) {
let t1 = Instant::now();
let mut q = Quantogram::new();
for sample in data.iter() {
q.add(*sample);
let _actual_median = q.median().unwrap();
}
let quantogram_time = t1.elapsed().as_micros();
let t2 = Instant::now();
let mut data_copy = Vec::new();
for sample in data {
data_copy.push(*sample);
data_copy.sort_by(|a, b| a.partial_cmp(b).unwrap());
let _expected_median = data_copy[data_copy.len() / 2]; }
let naive_time = t2.elapsed().as_micros();
(quantogram_time, naive_time)
}
fn absolute_relative_error(expected: f64, actual: f64) -> f64 {
(expected - actual).abs() / expected
}
#[test]
fn test_min() { assert_eq!(gapped_quantogram().min().unwrap(), 1.0); }
#[test]
fn test_max() { assert_eq!(gapped_quantogram().max().unwrap(), 99.0); }
#[test]
fn test_mean() { assert_eq!(gapped_quantogram().mean().unwrap(), 50.0); }
#[test]
fn test_mode_unimodal() {
let mut q = Quantogram::new();
let data = vec![1.0,2.0,3.0,3.0,4.0,4.0,4.0,5.0,6.0];
q.add_unweighted_samples(data.iter());
assert_eq!(q.mode(), vec![4.0]);
}
#[test]
fn test_mode_multimodal() {
let mut q = Quantogram::new();
let data = vec![1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,5.0,6.0];
q.add_unweighted_samples(data.iter());
assert_eq!(q.mode(), vec![3.0,4.0]);
}
#[test]
fn test_count() {
assert_eq!(Quantogram::new().count(), 0);
assert_eq!(gapped_quantogram().count(), 10);
}
#[test]
fn test_median() { assert_eq!(gapped_quantogram().median().unwrap(), 5.0); }
#[test]
fn test_quantile() {
assert_eq!(gapped_quantogram().quantile(0.0).unwrap(), 1.0);
assert_eq!(gapped_quantogram().quantile(0.75).unwrap(), 96.0);
assert_eq!(gapped_quantogram().quantile(1.0).unwrap(), 99.0);
}
#[test]
fn test_quantile_at() {
let q_mean = |r: Option<(f64,f64)>| { (r.unwrap().0 + r.unwrap().1) / 2.0 };
let mut q = Quantogram::new();
let data: Vec<f64> = vec![0,1,2,3,4,5,6,7,8,9,10]
.into_iter()
.map(|x| x as f64)
.collect();
q.add_unweighted_samples(data.iter());
assert_eq!(0.5, q_mean(q.quantile_at(5.0)));
assert_eq!(17.0/22.0, q_mean(q.quantile_at(8.0)));
}
#[test]
fn test_fussy_median() {
// The median falls inside the fixture's gap; fussy_quantile averages
// across it instead of returning a gap-edge value.
assert_eq!(gapped_quantogram().fussy_quantile(0.5, 2.0).unwrap(), 50.0);
}
#[test]
fn test_remove() {
let mut q = gapped_quantogram();
q.remove(99.0);
q.remove(98.0);
assert_eq!(q.median().unwrap(), 4.0);
}
#[test]
fn test_quantiles_with_negatives() {
let mut q = Quantogram::new();
let data = vec![-1.0,-2.0,-3.0,-4.0,-5.0,-95.0,-96.0,-97.0,-98.0,-99.0, 0.0, 1.0,2.0,3.0,4.0,5.0,95.0,96.0,97.0,98.0,99.0];
q.add_unweighted_samples(data.iter());
assert_eq!(q.median().unwrap(), 0.0);
assert_eq!(q.quantile(0.4).unwrap(), -2.0);
assert_eq!(q.quantile(0.6).unwrap(), 2.0);
}
#[test]
fn test_weighted_quantile() {
let mut q = Quantogram::new();
let data = vec![1.0,2.0,3.0,4.0,5.0,95.0,96.0,97.0,98.0,99.0];
q.add_unweighted_samples(data.iter());
q.add_weighted(0.0, 6.0);
assert_eq!(q.median().unwrap(), 2.0);
}
#[test]
fn test_finite() {
let mut q = Quantogram::new();
q.add(f64::NAN);
assert_eq!(q.finite(), 0.0);
let data = vec![1.0,2.0,3.0,4.0];
q.add_unweighted_samples(data.iter());
assert_eq!(q.finite(), 0.8);
}
#[test]
fn test_nan() {
let mut q = Quantogram::new();
q.add(f64::NAN);
assert_eq!(q.nan(), 1.0);
let data = vec![1.0,2.0,3.0,4.0];
q.add_unweighted_samples(data.iter());
assert_eq!(q.nan(), 0.2);
}
#[test]
fn test_size() {
// size() starts at 7 (the scalar accumulators) and grows by one fine
// bin plus one coarse bin per newly occupied bucket.
let mut q = Quantogram::new();
assert_eq!(q.size(), 7);
q.add(1.0);
assert_eq!(q.size(), 9);
q.add(1.0);
assert_eq!(q.size(), 9);
q.add(1.5);
assert_eq!(q.size(), 10);
}
#[test]
fn test_sparsity() {
let mut q = Quantogram::new();
for i in 1..1000000 {
q.add(i as f64);
}
assert!(q.size() < 600);
assert!(absolute_relative_error(q.median().unwrap(), 500000.0) <= 0.01);
}
#[test]
fn test_with_constrained_range() {
// Samples below 2^-10 clamp to zero; above 2^10 clamp to +infinity.
let mut q = QuantogramBuilder::new()
.with_smallest_power(-10)
.with_largest_power(10)
.build();
let data = vec![
0.0001, 0.00056, 0.002, 16.0, 1000.0,
6543.0, 15000.0, 2400000.0 ];
q.add_unweighted_samples(data.iter());
assert_eq!(q.min().unwrap(), 0.0);
assert_eq!(q.finite(), 0.625);
assert_eq!(q.zero(), 0.25);
assert_eq!(q.median().unwrap(), 0.002);
}
#[test]
fn test_accuracy() {
let mut data = randomish(-100.0, 2000000.0, 100000);
assert_median(&mut data, 0.01);
}
// NOTE(review): the two tests below compare wall-clock timings, so they
// can be flaky on loaded or slow machines.
#[test]
fn test_insert_heavy_speed() {
let mut short_data = randomish(-10000.0, 10000.0, 10000);
let mut long_data = randomish(-10000.0, 10000.0, 1000000);
let (short_quant_time, short_naive_time) = profile_median(&mut short_data);
let (long_quant_time, long_naive_time) = profile_median(&mut long_data);
let short_ratio = (short_quant_time as f64) / (short_naive_time as f64);
let long_ratio = (long_quant_time as f64) / (long_naive_time as f64);
println!("Short Ratio: {} Long ratio: {}", short_ratio, long_ratio);
assert!(short_ratio > 1.4 * long_ratio);
}
#[test]
fn test_median_heavy_speed() {
let mut short_data = randomish(-10000.0, 10000.0, 2000);
let mut long_data = randomish(-10000.0, 10000.0, 20000);
let (short_quant_time, short_naive_time) = profile_many_medians(&mut short_data);
let (long_quant_time, long_naive_time) = profile_many_medians(&mut long_data);
let short_ratio = (short_quant_time as f64) / (short_naive_time as f64);
let long_ratio = (long_quant_time as f64) / (long_naive_time as f64);
println!("Short Ratio: {} Long ratio: {}", short_ratio, long_ratio);
assert!(short_ratio > 8.0 * long_ratio);
}
#[test]
fn test_basics() {
let mut q = Quantogram::new();
q.add(10.0);
q.add(40.0);
q.add(20.0);
q.add(30.0);
q.add(50.0);
assert_eq!(q.count(), 5);
assert_eq!(q.min().unwrap(), 10.0);
assert_eq!(q.max().unwrap(), 50.0);
assert_eq!(q.mean().unwrap(), 30.0);
assert_eq!(q.median().unwrap(), 30.0);
assert_eq!(q.quantile(0.75).unwrap(), 40.0);
q.remove(10.0);
q.remove(20.0);
assert_eq!(q.mean().unwrap(), 40.0);
}
}
}