// asap_sketchlib 0.2.1

//! KLL quantile sketch (compact / insert-optimized variant).
//!
//! Insertion and compaction follow the compact KLL layout from:
//! "Insert-optimized implementation of streaming data sketches" (Pfeil et al., 2025).
//! CDF construction follows the pattern described in dgryski/go-kll, based on the
//! weighted CDF approach from the original KLL paper (Karnin, Lang & Liberty, FOCS 2016).
//!
//! References:
//! - Karnin, Lang & Liberty, "Optimal Quantile Approximation in Streams," FOCS 2016.
//!   <https://arxiv.org/abs/1603.05346>
//! - <https://www.amazon.science/publications/insert-optimized-implementation-of-streaming-data-sketches>

use rand::{Rng, rng};
use rmp_serde::decode::Error as RmpDecodeError;
use rmp_serde::encode::Error as RmpEncodeError;
use serde::{Deserialize, Serialize};

use crate::common::input::data_input_to_f64;
use crate::common::numerical::NumericalValue;
use crate::{DataInput, Vector1D};

/// Maximum number of compactor levels. The `levels` boundary array holds
/// `MAX_LEVELS + 1` entries and `compute_max_capacity` sums this many levels.
const MAX_LEVELS: usize = 61;

/// Number of per-height capacities precomputed in `KLL::capacity_cache`.
const CAPACITY_CACHE_LEN: usize = 20;
/// Upper clamp applied to both `k` and `m` by `KLL::init_internal`.
const MAX_CACHEABLE_K: usize = 26_602;
/// Geometric factor by which the level capacity shrinks per level.
const CAPACITY_DECAY: f64 = 2.0 / 3.0;
/// Accuracy parameter used by `KLL::default()`.
const DEFAULT_K: i32 = 200;

/// Coin generates deterministic pseudo-random coin flips while amortizing
/// calls to the RNG by consuming one bit at a time from a 64-bit buffer.
///
/// The sequence is fully determined by the seed passed to `from_seed`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub(crate) struct Coin {
    /// Current xorshift state; `normalize_seed` keeps it non-zero.
    state: u64,
    /// Bits of the most recent refill that have not been handed out yet
    /// (consumed from the least-significant end).
    bit_cache: u64,
    /// Number of unconsumed bits left in `bit_cache`. Defaults to 0 when
    /// absent from a serialized payload, which forces a refill on the next toss.
    #[serde(default)]
    remaining_bits: u8,
}

impl Coin {
    /// Builds a coin from a fresh 64-bit seed drawn from `rand::rng()`.
    pub fn new() -> Self {
        let seed = rng().random::<u64>();
        Self::from_seed(seed)
    }

    /// One xorshift round (shifts 12 / 25 / 27) followed by a wrapping
    /// multiplication with a fixed odd constant.
    pub fn xorshift_mult64(x: u64) -> u64 {
        let step1 = x ^ (x >> 12);
        let step2 = step1 ^ (step1 << 25);
        let step3 = step2 ^ (step2 >> 27);
        step3.wrapping_mul(2685821657736338717)
    }

    /// Builds a coin whose flip sequence is fully determined by `seed`.
    /// The bit cache starts empty, so the first toss triggers a refill.
    pub(crate) fn from_seed(seed: u64) -> Self {
        let state = Self::normalize_seed(seed);
        Self {
            state,
            bit_cache: 0,
            remaining_bits: 0,
        }
    }

    /// Maps the all-zero seed to a fixed non-zero fallback; every other seed
    /// is returned unchanged.
    #[inline]
    fn normalize_seed(seed: u64) -> u64 {
        const FALLBACK: u64 = 0x9e37_79b9_7f4a_7c15;
        match seed {
            0 => FALLBACK,
            nonzero => nonzero,
        }
    }

    /// Advances the generator and loads all 64 bits of the new state into
    /// the bit cache.
    #[inline]
    fn refill(&mut self) {
        let next = Self::normalize_seed(Self::xorshift_mult64(self.state));
        self.state = next;
        self.bit_cache = next;
        self.remaining_bits = u64::BITS as u8;
    }

    /// Returns the next coin flip, consuming the lowest cached bit and
    /// refilling the cache first when it is exhausted.
    pub fn toss(&mut self) -> bool {
        if self.remaining_bits == 0 {
            self.refill();
        }
        let heads = self.bit_cache & 1 == 1;
        self.remaining_bits -= 1;
        self.bit_cache >>= 1;
        heads
    }
}

/// A single (value, cumulative-quantile) pair in a CDF table.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CdfEntry {
    /// Retained sample value, converted to `f64`.
    value: f64,
    /// Cumulative weight up to and including this entry, divided by the
    /// total weight once `KLL::cdf` has finished building the table.
    quantile: f64,
}

/// Returns the total number of item slots needed to hold every level at
/// full capacity for the given `k` and `m`.
///
/// Level `i` contributes `max(ceil(k * CAPACITY_DECAY^i), m)` slots, summed
/// over `MAX_LEVELS` levels. The sketch allocates its buffer to this size
/// up front so it never has to reallocate.
fn compute_max_capacity(k: usize, m: usize) -> usize {
    let (slots, _) = (0..MAX_LEVELS).fold((0usize, 1.0_f64), |(slots, scale), _| {
        let level_cap = ((k as f64) * scale).ceil().max(m as f64) as usize;
        (slots + level_cap, scale * CAPACITY_DECAY)
    });
    slots
}

/// Keeps every second element of the run `items[begin..begin + pop]`,
/// starting at position `offset` (0 or 1), and packs the survivors into the
/// **upper** end of that range so they sit directly below the next level.
///
/// Returns the number of survivors, `ceil((pop - offset) / 2)`. Elements are
/// copied from the back so that no source slot is overwritten before it has
/// been read.
#[inline]
fn randomly_halve_up<T: Copy>(items: &mut [T], begin: usize, pop: usize, offset: usize) -> usize {
    let kept = (pop - offset).div_ceil(2);
    let first_src = begin + offset;
    let first_dst = begin + pop - kept;

    let mut i = kept;
    while i > 0 {
        i -= 1;
        items[first_dst + i] = items[first_src + 2 * i];
    }
    kept
}

/// In-place merge of two adjacent sorted runs, ordered by
/// `NumericalValue::total_cmp`.
///
/// `slice[..left_len]` is the first run and `slice[left_len..]` the second.
/// The shorter run is copied into the reusable scratch buffer `buf`; the
/// merge then proceeds forwards (left run buffered) or backwards (right run
/// buffered) so that unread elements are never overwritten.
#[inline]
fn merge_sorted_runs<T: NumericalValue>(slice: &mut [T], left_len: usize, buf: &mut Vec<T>) {
    let len = slice.len();
    // One of the runs is empty: nothing to merge.
    if left_len == 0 || left_len >= len {
        return;
    }
    // The runs are already in order end-to-end.
    if slice[left_len - 1].total_cmp(&slice[left_len]).is_le() {
        return;
    }

    buf.clear();

    if left_len <= len - left_len {
        // Left run is the shorter one: buffer it and merge front to back.
        buf.extend_from_slice(&slice[..left_len]);
        let (mut left, mut right, mut out) = (0, left_len, 0);
        while left < buf.len() && right < len {
            let next = if buf[left].total_cmp(&slice[right]).is_le() {
                left += 1;
                buf[left - 1]
            } else {
                right += 1;
                slice[right - 1]
            };
            slice[out] = next;
            out += 1;
        }
        // Any unread right-run elements are already in their final place;
        // only buffered leftovers need copying.
        let leftover = &buf[left..];
        slice[out..out + leftover.len()].copy_from_slice(leftover);
    } else {
        // Right run is the shorter one: buffer it and merge back to front.
        buf.extend_from_slice(&slice[left_len..]);
        let (mut left, mut right, mut out) = (left_len, buf.len(), len);
        while left > 0 && right > 0 {
            out -= 1;
            let next = if buf[right - 1].total_cmp(&slice[left - 1]).is_ge() {
                right -= 1;
                buf[right]
            } else {
                left -= 1;
                slice[left]
            };
            slice[out] = next;
        }
        // Unread left-run elements are already in place; flush what is
        // left of the buffered right run to the front.
        slice[..right].copy_from_slice(&buf[..right]);
    }
}

// ---------------------------------------------------------------------------
// KLL sketch
// ---------------------------------------------------------------------------

/// Compact, insert-optimized KLL quantile sketch.
///
/// Memory layout (grows leftward):
/// ```text
/// items: [ free ← | L0 (unsorted) | L1 | L2 | … | L_top ]
///         0        levels[0]                        levels[num_levels]
/// ```
///
/// `levels[h]` = start of level h.  `levels[h+1] - levels[h]` = size of level h.
/// An item stored in level `h` carries weight `2^h` in `count`, `rank` and `cdf`.
#[derive(Clone, Debug)]
pub struct KLL<T: NumericalValue = f64> {
    /// Pre-allocated item buffer of `max_capacity` slots; only
    /// `items[levels[0]..levels[num_levels]]` is in use.
    items: Box<[T]>,
    /// Level boundaries into `items` (`MAX_LEVELS + 1` entries).
    levels: Box<[usize]>,
    /// Accuracy parameter after normalization in `init_internal`.
    k: usize,
    /// Minimum per-level capacity after normalization in `init_internal`.
    m: usize,
    /// Number of levels currently in use.
    num_levels: usize,
    /// Length of `items`, as computed by `compute_max_capacity`.
    max_capacity: usize,
    /// Coin that picks the even/odd offset for each compaction.
    co: Coin,
    /// Explicit seed for the compaction RNG, when present. When `Some`, the
    /// sketch's bytes are reproducible across runs and `clear()` re-seeds
    /// from this value. When `None`, `clear()` replaces the coin with a fresh
    /// `Coin::new()`, which draws its seed from `rand::rng()`.
    seed: Option<u64>,
    /// Capacities indexed by distance from the top level (clamped to the
    /// last entry for deeper levels).
    capacity_cache: [u32; CAPACITY_CACHE_LEN],
    /// Index of the current top level (`num_levels - 1`, saturating at 0).
    top_height: usize,
    /// Cached `capacity_for_level(0)`, checked on every insert.
    level0_capacity: usize,
    /// Reusable scratch buffer for `merge_sorted_runs`.
    merge_buf: Vec<T>,
}

impl<T: NumericalValue> Default for KLL<T> {
    /// Builds an unseeded sketch with `k = DEFAULT_K` via `init_kll`.
    fn default() -> Self {
        let k = DEFAULT_K;
        Self::init_kll(k)
    }
}

impl<T: NumericalValue> KLL<T> {
    /// Creates an empty, unseeded KLL sketch with accuracy parameter `k` and
    /// minimum level capacity `m`. The compaction coin is seeded by
    /// `Coin::new()`, so results are not reproducible across runs.
    pub fn init(k: usize, m: usize) -> Self {
        let coin = Coin::new();
        Self::init_internal(k, m, coin, None)
    }

    /// Creates an empty KLL sketch whose compaction coin is seeded with
    /// `seed`.
    ///
    /// Two sketches built with the same seed and fed the same input sequence
    /// produce byte-identical serialized state, which is what reproducible
    /// replay and cross-process parity tests need. The seed is also stored on
    /// the sketch so that `clear()` re-seeds from it instead of drawing a new
    /// random seed.
    ///
    /// Callers that don't care about determinism should keep using `init` /
    /// `init_kll`.
    pub fn init_with_seed(k: usize, m: usize, seed: u64) -> Self {
        let coin = Coin::from_seed(seed);
        Self::init_internal(k, m, coin, Some(seed))
    }

    /// Creates a new KLL sketch with the given `k` and a default minimum level capacity of 8.
    ///
    /// A negative `k` is treated as 0 and therefore raised to the minimum
    /// capacity by `init`; previously the `as usize` cast wrapped it to a
    /// huge value that was then clamped to the *largest* supported `k`.
    pub fn init_kll(k: i32) -> Self {
        let k = k.max(0) as usize;
        Self::init(k, 8)
    }

    /// `init_kll` with an explicit RNG seed. See `init_with_seed`.
    ///
    /// A negative `k` is treated as 0 (and then raised to the minimum
    /// capacity) instead of wrapping to the largest supported `k`.
    pub fn init_kll_with_seed(k: i32, seed: u64) -> Self {
        let k = k.max(0) as usize;
        Self::init_with_seed(k, 8, seed)
    }

    /// Shared constructor: normalizes the parameters, allocates the buffers
    /// and fills the capacity cache.
    ///
    /// `m` is clamped to `[2, MAX_CACHEABLE_K]` and `k` to
    /// `[m, MAX_CACHEABLE_K]`. The sketch starts with a single empty level
    /// positioned at the very end of the item buffer.
    fn init_internal(k: usize, m: usize, coin: Coin, seed: Option<u64>) -> Self {
        let m = m.clamp(2, MAX_CACHEABLE_K);
        let k = k.clamp(m, MAX_CACHEABLE_K);
        let capacity = compute_max_capacity(k, m);

        // Level 0 is empty and sits at the right edge of the buffer.
        let mut levels = vec![0usize; MAX_LEVELS + 1].into_boxed_slice();
        levels[0] = capacity;
        levels[1] = capacity;

        let mut sketch = Self {
            items: vec![T::default(); capacity].into_boxed_slice(),
            levels,
            k,
            m,
            num_levels: 1,
            max_capacity: capacity,
            co: coin,
            seed,
            capacity_cache: [0; CAPACITY_CACHE_LEN],
            top_height: 0,
            level0_capacity: 0,
            merge_buf: Vec::with_capacity(k),
        };
        sketch.rebuild_capacity_cache();
        sketch
    }

    /// Hot-path insert: claims the slot just left of level 0, stores the
    /// item there, and compacts when level 0 exceeds its capacity.
    ///
    /// If no free slot is left in front of level 0, a compaction pass runs
    /// first to make room.
    #[inline]
    fn push_value(&mut self, value: T) {
        if self.levels[0] == 0 {
            self.compress_while_updating();
        }
        let slot = self.levels[0] - 1;
        self.items[slot] = value;
        self.levels[0] = slot;

        let level0_len = self.levels[1] - slot;
        if level0_len > self.level0_capacity {
            self.compress_while_updating();
        }
    }

    /// Inserts one typed numeric value into the sketch (copied out of `val`).
    pub fn update(&mut self, val: &T) {
        let value = *val;
        self.push_value(value);
    }

    // -- Compaction ----------------------------------------------------------

    /// Cascading compaction starting at level 0.
    ///
    /// Walks upward while the current level holds more items than its
    /// capacity, compacting each such level into the one above and adding a
    /// new top level when the current top itself overflows. Stops at the
    /// first level that is within capacity.
    fn compress_while_updating(&mut self) {
        let mut level = 0;
        while self.level_size(level) > self.capacity_for_level(level) {
            if level + 1 == self.num_levels {
                self.add_new_top_level();
            }
            self.compact(level);
            level += 1;
        }
    }

    /// Compacts level `h` into level `h + 1`.
    ///
    /// Every second item of level `h` (starting at a coin-chosen offset of 0
    /// or 1) survives and is merged into the sorted run of level `h + 1`;
    /// level `h` is left empty. The caller must ensure level `h + 1` exists
    /// (see `compress_while_updating`, which adds a top level first).
    fn compact(&mut self, h: usize) {
        let beg = self.levels[h];
        let end = self.levels[h + 1];
        let pop = end - beg;

        // Level 0 is the only unsorted level; higher levels are kept sorted.
        if h == 0 {
            self.items[beg..end].sort_unstable_by(T::total_cmp);
        }

        // Survivors are packed into the upper end of the level's range so
        // they are adjacent to the level above.
        let offset = usize::from(self.co.toss());
        let num_survivors = randomly_halve_up(&mut self.items, beg, pop, offset);
        let surv_start = beg + pop - num_survivors;

        // Merge the survivors with the existing (sorted) contents of h + 1.
        let pop_above = self.levels[h + 2] - end;
        if pop_above > 0 {
            merge_sorted_runs(
                &mut self.items[surv_start..end + pop_above],
                num_survivors,
                &mut self.merge_buf,
            );
        }

        // Discarded slots open a gap of `delta` items below the survivors;
        // slide all lower levels right to close it and keep the used region
        // contiguous.
        let delta = surv_start - beg;
        if delta > 0 && h > 0 {
            let lo = self.levels[0];
            let hi = beg;
            if hi > lo {
                self.items.copy_within(lo..hi, lo + delta);
            }
            for lvl in self.levels[..h].iter_mut() {
                *lvl += delta;
            }
        }

        // Level h is now empty and level h + 1 starts where the survivors do.
        self.levels[h] = surv_start;
        self.levels[h + 1] = surv_start;
    }

    /// Appends an empty level above the current top and refreshes the
    /// cached top height and level-0 capacity, which both depend on the
    /// number of levels.
    fn add_new_top_level(&mut self) {
        let old_top = self.num_levels;
        // The new level is empty: its end boundary equals the old sentinel.
        let sentinel = self.levels[old_top];
        self.levels[old_top + 1] = sentinel;
        self.num_levels = old_top + 1;
        self.top_height = old_top;
        self.level0_capacity = self.capacity_for_level(0);
    }

    // -- Capacity helpers ----------------------------------------------------

    /// Returns the capacity of `level`, looked up by its distance from the
    /// top level. Distances beyond the cache length share the last cached
    /// entry; a sketch with no levels reports `m`.
    fn capacity_for_level(&self, level: usize) -> usize {
        match self.num_levels {
            0 => self.m,
            _ => {
                let depth = self
                    .top_height
                    .saturating_sub(level)
                    .min(CAPACITY_CACHE_LEN - 1);
                self.capacity_cache[depth] as usize
            }
        }
    }

    /// Recomputes `top_height`, the per-depth capacity table and the cached
    /// level-0 capacity from the current `k`, `m` and `num_levels`.
    ///
    /// Entry `i` of the table is `max(ceil(k * CAPACITY_DECAY^i), m)`.
    fn rebuild_capacity_cache(&mut self) {
        self.top_height = self.num_levels.saturating_sub(1);

        let (k, m) = (self.k as f64, self.m);
        let mut scale = 1.0_f64;
        for slot in self.capacity_cache.iter_mut() {
            *slot = ((k * scale).ceil() as usize).max(m) as u32;
            scale *= CAPACITY_DECAY;
        }

        self.level0_capacity = self.capacity_for_level(0);
    }

    /// Number of items currently stored in level `h`.
    #[inline]
    fn level_size(&self, h: usize) -> usize {
        let (start, end) = (self.levels[h], self.levels[h + 1]);
        end - start
    }

    // -- Query-side ----------------------------------------------------------

    /// Builds and returns the cumulative distribution function (CDF) from the current sketch state.
    ///
    /// Every retained item contributes its level weight (`2^h` for level
    /// `h`). Entries are sorted by value and each entry's `quantile` becomes
    /// the cumulative weight up to and including it, divided by the total
    /// weight. An empty sketch yields an empty table.
    ///
    /// Values are ordered with `f64::total_cmp`, so a NaN stored in the
    /// sketch no longer panics here (NaNs sort to the ends of the table).
    pub fn cdf(&self) -> Cdf {
        let mut cdf = Cdf {
            entries: Vector1D::init(self.buffer_size()),
        };
        let mut total_w = 0usize;

        for h in 0..self.num_levels {
            let start = self.levels[h];
            let end = self.levels[h + 1];
            let weight = 1usize << h;
            for &value in &self.items[start..end] {
                // `quantile` temporarily holds the raw weight; it is turned
                // into a cumulative fraction below.
                cdf.entries.push(CdfEntry {
                    value: value.to_f64(),
                    quantile: weight as f64,
                });
            }
            total_w += (end - start) * weight;
        }

        if total_w == 0 {
            return cdf;
        }

        // Total order instead of `partial_cmp(..).unwrap()`, which panicked
        // as soon as a NaN had been inserted.
        cdf.entries
            .as_mut_slice()
            .sort_by(|a, b| a.value.total_cmp(&b.value));

        let mut cur_w = 0.0;
        for entry in cdf.entries.as_mut_slice() {
            cur_w += entry.quantile;
            entry.quantile = cur_w / total_w as f64;
        }

        cdf
    }

    /// Merges all items from another KLL sketch into this one.
    pub fn merge(&mut self, other: &KLL<T>) {
        let used_start = other.levels[0];
        let used_end = other.levels[other.num_levels];
        for &value in &other.items[used_start..used_end] {
            self.push_value(value);
        }
    }

    /// Returns the estimated value at quantile `q` (in `[0, 1]`).
    ///
    /// Builds a fresh CDF table on every call; callers issuing many queries
    /// can call `cdf()` once and query the returned table instead.
    pub fn quantile(&self, q: f64) -> f64 {
        self.cdf().query(q)
    }

    /// Returns the estimated (weighted) rank of value `x`: the summed weight
    /// of all retained items whose value is `<= x`, where an item in level
    /// `h` weighs `2^h`.
    pub fn rank(&self, x: f64) -> usize {
        (0..self.num_levels)
            .map(|h| {
                let run = &self.items[self.levels[h]..self.levels[h + 1]];
                let hits = run.iter().filter(|&&v| v.to_f64() <= x).count();
                hits * (1usize << h)
            })
            .sum()
    }

    /// Returns the configured compactor capacity `k`.
    pub fn k(&self) -> usize {
        self.k
    }

    /// Returns the total weight represented by the sketch: the sum over all
    /// levels of `level size * 2^level`.
    pub fn count(&self) -> usize {
        (0..self.num_levels)
            .map(|h| self.level_size(h) * (1 << h))
            .sum()
    }

    /// Number of items currently retained across all levels.
    fn buffer_size(&self) -> usize {
        let first_used = self.levels[0];
        let past_last = self.levels[self.num_levels];
        past_last - first_used
    }

    // -- Lifecycle -----------------------------------------------------------

    /// Resets the sketch to its empty initial state, keeping the same `k`
    /// and `m` parameters.
    ///
    /// If the sketch was constructed with an explicit seed (`init_with_seed`
    /// / `init_kll_with_seed`), the coin is rebuilt from that seed so that
    /// determinism survives `clear()` (and therefore window rotation in
    /// stateful aggregators). Otherwise the coin is replaced by a fresh
    /// `Coin::new()`, which draws a new random seed.
    pub fn clear(&mut self) {
        // A single empty level at the right edge of the buffer.
        let end = self.max_capacity;
        self.num_levels = 1;
        self.levels[0] = end;
        self.levels[1] = end;

        self.co = if let Some(seed) = self.seed {
            Coin::from_seed(seed)
        } else {
            Coin::new()
        };
        self.rebuild_capacity_cache();
    }

    /// Prints a summary line followed by the contents of every level to
    /// stdout, top level first. Intended for debugging only.
    pub fn print_compactors(&self)
    where
        T: std::fmt::Debug,
    {
        let retained = self.buffer_size();
        println!(
            "KLL Packed (k={}, levels={}, items={})",
            self.k, self.num_levels, retained
        );
        let mut h = self.num_levels;
        while h > 0 {
            h -= 1;
            let run = &self.items[self.levels[h]..self.levels[h + 1]];
            println!("  L{}: {:?}", h, run);
        }
    }

    // -- Wire-format-aligned accessors --------------------------------------

    /// Returns the configured `k` parameter as a `u32` for wire encoding.
    #[inline]
    pub fn wire_k(&self) -> u32 {
        let k = self.k;
        k as u32
    }

    /// Returns the configured `m` parameter as a `u32` for wire encoding.
    #[inline]
    pub fn wire_m(&self) -> u32 {
        let m = self.m;
        m as u32
    }

    /// Returns the number of compactor levels currently in use, as a `u32`
    /// for wire encoding.
    #[inline]
    pub fn wire_num_levels(&self) -> u32 {
        let levels_in_use = self.num_levels;
        levels_in_use as u32
    }

    /// Returns the compaction-coin state in `sketchlib-go::CoinState`
    /// shape: `(state, bit_cache, remaining_bits)`. Wire-format wrappers
    /// pack this directly into the `KLLState.coin` proto field; without
    /// it the coin state would have to be poked from private internals
    /// or reconstructed by a serde round-trip, neither of which is
    /// stable across `asap_sketchlib` releases.
    #[inline]
    pub fn wire_coin(&self) -> (u64, u64, u32) {
        (
            self.co.state,
            self.co.bit_cache,
            self.co.remaining_bits as u32,
        )
    }

    /// Returns the level-boundary array in `sketchlib-go`'s wire shape:
    /// length `num_levels + 1`, starting at 0 and ending at the total
    /// number of retained items.
    ///
    /// The chunk `wire_items()[wire_levels()[i] .. wire_levels()[i+1]]` is
    /// the **top-most-first** run for the proto's `KLLState.levels`/`items`
    /// fields, matching `sketchlib-go::KLLSketch.SerializePortable`. See the
    /// `KLLState` docstring in `proto/kll/kll.proto`: index `i` in `levels`
    /// maps to compactor level `num_levels - 1 - i` in the in-memory
    /// representation. Closes part of ProjectASAP/ASAPCollector#243.
    pub fn wire_levels(&self) -> Vec<u32> {
        let n = self.num_levels;
        let mut bounds = Vec::with_capacity(n + 1);
        let mut running = 0u32;
        bounds.push(running);
        // Accumulate level sizes from the top compactor level downward.
        bounds.extend((0..n).rev().map(|h| {
            running += self.level_size(h) as u32;
            running
        }));
        bounds
    }

    /// Returns the retained items in `sketchlib-go`'s wire shape:
    /// concatenated top-most-level-first.
    ///
    /// Sketchlib-go pushes inputs into the unsorted L0 in input order;
    /// `asap_sketchlib`'s compact layout instead grows L0 leftward, so the
    /// buffer reads reverse-input-order. This accessor reverses the unsorted
    /// L0 run so the emitted byte sequence is identical to Go's for the same
    /// input stream (when no compaction has yet occurred). Higher levels are
    /// sorted in both producers and emitted as-is.
    ///
    /// Caveat: after L0 → L1 compaction the two producers' L1 content
    /// orderings diverge (Go's `compact` leaves L1 as two concatenated
    /// sorted runs; `asap_sketchlib` merge-sorts on the way up). The
    /// retained set is identical and quantile semantics agree, but strict
    /// byte parity past the first compaction is not guaranteed. The
    /// cross-language byte-parity test in
    /// `ASAPCollector::cross_language_parity::kll_byte_parity_with_go`
    /// uses `(1..=50)` with `k=200`, well below the L0 capacity, so this
    /// caveat does not affect the parity guard.
    pub fn wire_items(&self) -> Vec<T> {
        let mut flat = Vec::with_capacity(self.buffer_size());
        for h in (0..self.num_levels).rev() {
            let run = &self.items[self.levels[h]..self.levels[h + 1]];
            match h {
                // `push_value` writes each new item to the left of the
                // previous one, so L0 is stored newest-first; flip it back
                // to input order to match Go's layout.
                0 => flat.extend(run.iter().rev().copied()),
                _ => flat.extend_from_slice(run),
            }
        }
        flat
    }

    // -- Serialization -------------------------------------------------------

    /// Encodes the sketch as MessagePack via its `Serialize` implementation.
    ///
    /// # Errors
    /// Returns the `rmp_serde` encode error if serialization fails.
    pub fn serialize_to_bytes(&self) -> Result<Vec<u8>, RmpEncodeError>
    where
        T: Serialize,
    {
        let encoded = rmp_serde::to_vec(self)?;
        Ok(encoded)
    }

    /// Decodes a sketch from a MessagePack byte slice via its `Deserialize`
    /// implementation.
    ///
    /// # Errors
    /// Returns the `rmp_serde` decode error if `bytes` cannot be decoded.
    pub fn deserialize_from_bytes(bytes: &[u8]) -> Result<Self, RmpDecodeError>
    where
        T: for<'de> Deserialize<'de>,
    {
        let sketch = rmp_serde::from_slice(bytes)?;
        Ok(sketch)
    }

    /// Sorts every non-empty level above level 0 with `T::total_cmp`.
    /// Level 0 is left untouched because it is allowed to be unsorted.
    fn ensure_levels_sorted(&mut self) {
        // The range is empty when there is at most one level.
        for h in 1..self.num_levels {
            let (lo, hi) = (self.levels[h], self.levels[h + 1]);
            if lo >= hi {
                continue;
            }
            self.items[lo..hi].sort_unstable_by(T::total_cmp);
        }
    }
}

/// Wire format for serialization (only the used portion of the buffer).
///
/// NOTE(review): `rmp_serde`'s default struct encoding is presumably
/// positional, in which case reordering these fields changes the byte
/// format — confirm before touching the field order.
#[derive(Serialize, Deserialize)]
struct KLLWire<T> {
    /// Retained items, lowest level first (the used slice of `KLL::items`).
    items: Vec<T>,
    /// Level boundaries rebased so that the first entry is 0.
    levels: Vec<usize>,
    /// Accuracy parameter of the sketch.
    k: usize,
    /// Minimum level capacity of the sketch.
    m: usize,
    /// Number of levels in use; `levels` carries `num_levels + 1` entries.
    num_levels: usize,
    /// Snapshot of the compaction coin.
    co: Coin,
}

impl<T: NumericalValue + Serialize> Serialize for KLL<T> {
    /// Serializes the sketch through `KLLWire`: only the occupied part of
    /// the item buffer is written, and level boundaries are rebased so the
    /// first one is 0. The explicit seed is not part of the wire format.
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let base = self.levels[0];
        let top = self.levels[self.num_levels];
        let boundaries = &self.levels[..=self.num_levels];

        KLLWire {
            items: self.items[base..top].to_vec(),
            levels: boundaries.iter().map(|&b| b - base).collect(),
            k: self.k,
            m: self.m,
            num_levels: self.num_levels,
            co: self.co.clone(),
        }
        .serialize(serializer)
    }
}

impl KLL<f64> {
    /// Converts a [`DataInput`] to `f64` and inserts it into the sketch.
    ///
    /// This adapter exists for the `HydraCounter` dispatch path, which
    /// stores a type-erased `DataInput`.
    ///
    /// # Errors
    /// Propagates the error from `data_input_to_f64` when the input cannot
    /// be converted; nothing is inserted in that case.
    pub fn update_data_input(&mut self, val: &DataInput) -> Result<(), &'static str> {
        let numeric = data_input_to_f64(val)?;
        self.push_value(numeric);
        Ok(())
    }
}

impl<'de, T: NumericalValue + Deserialize<'de>> Deserialize<'de> for KLL<T> {
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        let wire = KLLWire::<T>::deserialize(deserializer)?;
        let max_cap = compute_max_capacity(wire.k, wire.m);
        let used_len = wire.items.len();
        let offset = max_cap - used_len;

        let mut items = vec![T::default(); max_cap].into_boxed_slice();
        items[offset..offset + used_len].copy_from_slice(&wire.items);

        let mut levels = vec![0usize; MAX_LEVELS + 1].into_boxed_slice();
        for (i, &l) in wire.levels.iter().enumerate() {
            levels[i] = l + offset;
        }

        let mut sketch = KLL {
            items,
            levels,
            k: wire.k,
            m: wire.m,
            num_levels: wire.num_levels,
            max_capacity: max_cap,
            co: wire.co,
            // Wire format does not carry the explicit-seed flag — a
            // round-tripped sketch keeps its current coin state but
            // would re-randomize from time on a subsequent clear().
            // Callers that need clear()-determinism after a deserialize
            // should rebuild via init_with_seed.
            seed: None,
            capacity_cache: [0; CAPACITY_CACHE_LEN],
            top_height: 0,
            level0_capacity: 0,
            merge_buf: Vec::with_capacity(wire.k),
        };
        sketch.rebuild_capacity_cache();
        sketch.ensure_levels_sorted();
        Ok(sketch)
    }
}

/// The CDF for quantile queries.
///
/// Produced by `KLL::cdf`: entries are sorted by value and carry the
/// cumulative normalized weight up to and including that value.
pub struct Cdf {
    /// CDF table, ascending in both `value` and `quantile`.
    entries: Vector1D<CdfEntry>,
}

impl Cdf {
    /// Returns the quantile for value `x` using the CDF table: the
    /// cumulative quantile of the last entry whose value is `<= x`, or `0.0`
    /// when no such entry exists (including an empty table).
    ///
    /// When several entries share the value `x`, the quantile of the *last*
    /// of them is returned. The previous binary search could land on any of
    /// the duplicates and so under-report the quantile. A NaN `x` compares
    /// below nothing and yields `0.0`.
    pub fn quantile(&self, x: f64) -> f64 {
        let slice = self.entries.as_slice();
        match slice.partition_point(|e| e.value <= x) {
            0 => 0.0,
            n => slice[n - 1].quantile,
        }
    }

    /// Prints the CDF entries for debugging.
    pub fn print_entries(&self) {
        println!("entries: {:?}", self.entries);
    }

    /// Returns the estimated value corresponding to quantile `p`: the value
    /// of the first entry whose cumulative quantile is `>= p`, or the largest
    /// value when `p` exceeds every entry. Returns `0.0` for an empty table.
    pub fn query(&self, p: f64) -> f64 {
        if self.entries.is_empty() {
            return 0.0;
        }
        let slice = self.entries.as_slice();
        let found = slice.binary_search_by(|e| {
            e.quantile
                .partial_cmp(&p)
                .unwrap_or(std::cmp::Ordering::Less)
        });
        match found {
            // Exact hit, or the insertion point clamped to the last entry.
            Ok(idx) | Err(idx) => slice[idx.min(slice.len() - 1)].value,
        }
    }

    /// Quantile estimation of value `x` using linear interpolation between
    /// the two neighbouring table entries.
    ///
    /// Returns `0.0` for an empty table or when no entry is smaller than
    /// `x`, and `1.0` when every entry is smaller than `x`.
    pub fn quantile_li(&self, x: f64) -> f64 {
        let slice = self.entries.as_slice();
        if slice.is_empty() {
            return 0.0;
        }
        let idx = slice.partition_point(|e| e.value < x);
        if idx == slice.len() {
            return 1.0;
        }
        if idx == 0 {
            return 0.0;
        }
        // Here a < x <= b, so the denominator below is non-zero.
        let a = slice[idx - 1].value;
        let aq = slice[idx - 1].quantile;
        let b = slice[idx].value;
        let bq = slice[idx].quantile;
        ((a - x) * bq + (x - b) * aq) / (a - b)
    }

    /// Value estimation given quantile `p`, using linear interpolation
    /// between the two neighbouring table entries.
    ///
    /// Returns `0.0` for an empty table, the smallest value when `p` is at or
    /// below the first quantile, and the largest value when `p` is above the
    /// last one.
    pub fn query_li(&self, p: f64) -> f64 {
        let slice = self.entries.as_slice();
        if slice.is_empty() {
            return 0.0;
        }
        let idx = slice.partition_point(|e| e.quantile < p);
        if idx == slice.len() {
            return slice[slice.len() - 1].value;
        }
        if idx == 0 {
            return slice[0].value;
        }
        // Here aq < p <= bq, so the denominator below is non-zero.
        let a = slice[idx - 1].value;
        let aq = slice[idx - 1].quantile;
        let b = slice[idx].value;
        let bq = slice[idx].quantile;
        ((aq - p) * b + (p - bq) * a) / (aq - bq)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_utils::{sample_uniform_f64, sample_zipf_f64};

    // Ensure each 64-bit chunk is consumed bit-by-bit before refilling.
    #[test]
    fn coin_bit_cache_behavior() {
        let seed = 0x0123_4567_89ab_cdef;
        let mut coin = Coin::from_seed(seed);
        let mut expected_state = Coin::normalize_seed(seed);

        for block in 0..3 {
            expected_state = Coin::normalize_seed(Coin::xorshift_mult64(expected_state));
            for bit in 0..64 {
                let expected = ((expected_state >> bit) & 1) != 0;
                assert_eq!(
                    coin.toss(),
                    expected,
                    "mismatch at block {block}, bit {bit}"
                );
            }
        }
    }

    // Zero seeds must map to a valid state and never fall back to zero.
    #[test]
    fn coin_state_never_zero() {
        let mut coin = Coin::from_seed(0);
        assert_ne!(coin.state, 0);

        for _ in 0..128 {
            coin.toss();
            assert_ne!(coin.state, 0);
        }
    }

    // Two sketches built with the same seed and fed the same input
    // sequence must produce byte-identical serialized state. Without
    // the seedable constructor this is impossible because Coin::new()
    // pulls from a non-deterministic source.
    #[test]
    fn seeded_sketches_are_byte_identical() {
        const SEED: u64 = 42;
        let values = sample_uniform_f64(0.0, 1_000_000.0, 5000, 7);

        let mut a: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, SEED);
        let mut b: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, SEED);
        for v in &values {
            a.update(v);
            b.update(v);
        }
        let bytes_a = a.serialize_to_bytes().expect("serialize a");
        let bytes_b = b.serialize_to_bytes().expect("serialize b");
        assert_eq!(
            bytes_a, bytes_b,
            "seeded KLL sketches with identical inputs produced different bytes"
        );
    }

    // Different seeds must (with high probability) drive different
    // compaction outcomes — proves the seed actually propagates into
    // toss() rather than being a no-op stored on the struct.
    #[test]
    fn different_seeds_produce_different_bytes() {
        let values = sample_uniform_f64(0.0, 1_000_000.0, 5000, 11);

        let mut a: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, 1);
        let mut b: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, 2);
        for v in &values {
            a.update(v);
            b.update(v);
        }
        let bytes_a = a.serialize_to_bytes().expect("serialize a");
        let bytes_b = b.serialize_to_bytes().expect("serialize b");
        assert_ne!(
            bytes_a, bytes_b,
            "seeds 1 and 2 should not produce identical sketch bytes"
        );
    }

    // clear() on a seeded sketch must re-seed from the stored seed,
    // not from the wall clock. Otherwise determinism evaporates after
    // the first window rotation in stateful aggregators that reuse a
    // KLL across windows.
    #[test]
    fn clear_preserves_seed_determinism() {
        const SEED: u64 = 1234;
        let values = sample_uniform_f64(0.0, 1_000_000.0, 3000, 13);

        let mut a: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, SEED);
        // Burn a partial window with unrelated data, then clear.
        let noise = sample_uniform_f64(0.0, 1_000_000.0, 1500, 99);
        for v in &noise {
            a.update(v);
        }
        a.clear();

        let mut b: KLL<f64> = KLL::init_kll_with_seed(SKETCH_K, SEED);
        for v in &values {
            a.update(v);
            b.update(v);
        }
        let bytes_a = a.serialize_to_bytes().expect("serialize a");
        let bytes_b = b.serialize_to_bytes().expect("serialize b");
        assert_eq!(
            bytes_a, bytes_b,
            "clear() lost determinism: post-clear sketch diverges from a fresh seeded sketch"
        );
    }

    #[derive(Clone, Copy)]
    enum TestDistribution {
        Uniform {
            min: f64,
            max: f64,
        },
        Zipf {
            min: f64,
            max: f64,
            domain: usize,
            exponent: f64,
        },
    }

    const SKETCH_K: i32 = 200;

    fn build_kll_with_distribution(
        k: i32,
        sample_size: usize,
        distribution: TestDistribution,
        seed: u64,
    ) -> (KLL, Vec<f64>) {
        let mut sketch = KLL::init_kll(k);
        let values = match distribution {
            TestDistribution::Uniform { min, max } => {
                sample_uniform_f64(min, max, sample_size, seed)
            }
            TestDistribution::Zipf {
                min,
                max,
                domain,
                exponent,
            } => sample_zipf_f64(min, max, domain, exponent, sample_size, seed),
        };

        for &value in &values {
            sketch.update_data_input(&DataInput::F64(value)).unwrap();
        }

        (sketch, values)
    }

    /// Exact nearest-rank lookup used as ground truth by the accuracy tests.
    ///
    /// `data` is expected to be sorted ascending by the caller. Quantiles at or
    /// below 0 select the first element, quantiles at or above 1 select the
    /// last one, and anything in between selects index
    /// `ceil(quantile * len) - 1`, clamped into the slice.
    ///
    /// Panics if `data` is empty.
    fn quantile_from_sorted(data: &[f64], quantile: f64) -> f64 {
        assert!(!data.is_empty(), "data set must not be empty");
        let last = data.len() - 1;
        let idx = if quantile <= 0.0 {
            0
        } else if quantile >= 1.0 {
            last
        } else {
            // 1-based rank of the requested quantile, converted to a 0-based index.
            let rank = (quantile * data.len() as f64).ceil() as isize;
            (rank - 1).clamp(0, last as isize) as usize
        };
        data[idx]
    }

    /// Checks every requested quantile estimate against a rank-error window.
    ///
    /// For each `(quantile, label)` pair the estimate from `sketch.cdf()` must
    /// lie between the exact elements of `sorted_truth` at
    /// `quantile - tolerance` and `quantile + tolerance` (clamped to `[0, 1]`).
    /// `context`, `sample_size` and `seed` only appear in the failure message.
    fn assert_quantiles_within_error(
        sketch: &KLL,
        sorted_truth: &[f64],
        quantiles: &[(f64, &str)],
        tolerance: f64,
        context: &str,
        sample_size: usize,
        seed: u64,
    ) {
        let cdf = sketch.cdf();
        for (quantile, label) in quantiles.iter().copied() {
            // Exact values bounding the acceptable window around `quantile`.
            let truth_min = quantile_from_sorted(sorted_truth, (quantile - tolerance).max(0.0));
            let truth_max = quantile_from_sorted(sorted_truth, (quantile + tolerance).min(1.0));
            let estimate = cdf.query(quantile);
            let within_window = truth_min <= estimate && estimate <= truth_max;
            assert!(
                within_window,
                "{label} exceeded tolerance: context={context}, sample_size={sample_size}, seed=0x{seed:08x}, \
                quantile={quantile:.4}, truth_min={truth_min:.4}, truth_max={truth_max:.4}, \
                estimate={estimate:.4}, tolerance={tolerance:.4}, total_length={}",
                sorted_truth.len()
            );
        }
    }

    /// Runs the rank-error check for a uniform and a Zipf input at several
    /// sample sizes, using `SKETCH_K` and a tolerance of 0.02.
    #[test]
    fn distributions_quantiles_stay_within_rank_error() {
        const TOLERANCE: f64 = 0.02;
        const SAMPLE_SIZES: &[usize] = &[1_000, 5_000, 20_000, 100_000, 1_000_000, 5_000_000];
        const QUANTILES: &[(f64, &str)] = &[
            (0.0, "min"),
            (0.10, "p10"),
            (0.25, "p25"),
            (0.50, "p50"),
            (0.75, "p75"),
            (0.90, "p90"),
            (1.0, "max"),
        ];

        // Each entry is (name used in failure messages, distribution, seed base).
        let cases = [
            (
                "uniform",
                TestDistribution::Uniform {
                    min: 0.0,
                    max: 100_000_000.0,
                },
                0xA5A5_0000_u64,
            ),
            (
                "zipf",
                TestDistribution::Zipf {
                    min: 1_000_000.0,
                    max: 10_000_000.0,
                    domain: 8_192,
                    exponent: 1.1,
                },
                0xB4B4_0000_u64,
            ),
        ];

        for (name, distribution, seed_base) in cases {
            // The i-th sample size is generated with seed `seed_base + i`.
            for (seed, &sample_size) in (seed_base..).zip(SAMPLE_SIZES) {
                let (sketch, mut values) =
                    build_kll_with_distribution(SKETCH_K, sample_size, distribution, seed);
                values.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
                assert_quantiles_within_error(
                    &sketch,
                    &values,
                    QUANTILES,
                    TOLERANCE,
                    name,
                    sample_size,
                    seed,
                );
            }
        }
    }

    /// Exercises `update_data_input` with several numeric `DataInput` variants
    /// and checks that a string input is rejected with the expected message.
    #[test]
    fn test_data_input_api() {
        let mut kll = KLL::init_kll(128);

        // One value per numeric variant, inserted in this order.
        let numeric_inputs = [
            DataInput::I32(10),
            DataInput::I64(20),
            DataInput::F64(30.5),
            DataInput::F32(40.2),
            DataInput::U32(50),
        ];
        for input in &numeric_inputs {
            kll.update_data_input(input).unwrap();
        }

        // The middle of the five inputs is 30.5; the assertion only requires
        // the estimate to fall strictly between 20.0 and 40.2.
        let median = kll.cdf().query(0.5);
        assert!(median > 20.0 && median < 40.2, "Median = {}", median);

        // Non-numeric input must be reported as an error.
        let rejected = kll.update_data_input(&DataInput::String(String::from("not a number")));
        assert!(rejected.is_err());
        assert_eq!(
            rejected.unwrap_err(),
            "KLL sketch only accepts numeric inputs"
        );
    }

    /// Inserts five values into a sketch built with `KLL::init(3, 3)` and
    /// checks that the median estimate is either 30 or 40.
    #[test]
    fn test_forced_compact() {
        // Small k/m so that compaction is forced to happen.
        let mut kll = KLL::init(3, 3);
        for value in [10.0, 20.0, 30.0, 40.0, 50.0] {
            kll.update_data_input(&DataInput::F64(value)).unwrap();
        }

        let median = kll.cdf().query(0.5);
        // Only 30 and 40 are acceptable answers.
        assert!(median == 30.0 || median == 40.0, "Median = {}", median);
    }

    /// Inserts five values into a sketch built with `KLL::init_kll(8)` and
    /// checks that the median estimate is exactly the middle input.
    #[test]
    fn test_no_compact() {
        // k = 8 is intended to be large enough that no compaction happens.
        let mut kll = KLL::init_kll(8);
        for value in [10.0, 20.0, 30.0, 40.0, 50.0] {
            kll.update_data_input(&DataInput::F64(value)).unwrap();
        }

        // The median of the five inputs is 30.
        let median = kll.cdf().query(0.5);
        assert!(median == 30.0, "Median = {}", median);
    }

    /// Splits one sample stream across two sketches, merges them, and checks
    /// the merged sketch against the exact quantiles of the full stream.
    #[test]
    fn merge_preserves_quantiles_within_tolerance() {
        const TOLERANCE: f64 = 0.02;
        const QUANTILES: &[(f64, &str)] = &[
            (0.0, "min"),
            (0.10, "p10"),
            (0.25, "p25"),
            (0.50, "p50"),
            (0.75, "p75"),
            (0.90, "p90"),
            (1.0, "max"),
        ];

        let values = sample_uniform_f64(1_000_000.0, 10_000_000.0, 10_000, 0xC0FFEE);
        let mut sketch_a = KLL::init_kll(SKETCH_K);
        let mut sketch_b = KLL::init_kll(SKETCH_K);

        // Even positions feed `sketch_a`, odd positions feed `sketch_b`.
        for (idx, &value) in values.iter().enumerate() {
            match idx % 2 {
                0 => sketch_a.update_data_input(&DataInput::F64(value)).unwrap(),
                _ => sketch_b.update_data_input(&DataInput::F64(value)).unwrap(),
            };
        }

        sketch_a.merge(&sketch_b);

        // The samples are no longer needed in insertion order, so sort them in place.
        let total = values.len();
        let mut sorted = values;
        sorted.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
        assert_quantiles_within_error(
            &sketch_a,
            &sorted,
            QUANTILES,
            TOLERANCE,
            "merge",
            total,
            0x00C0_FFEE,
        );
    }

    /// A sketch that never received an update must still produce a CDF whose
    /// lookups all return 0.0.
    #[test]
    fn cdf_handles_empty_sketch() {
        let untouched = KLL::<f64>::init_kll(64);
        let cdf = untouched.cdf();

        // All three lookups are expected to report 0.0 on an empty sketch.
        assert_eq!(cdf.quantile(123.0), 0.0);
        assert_eq!(cdf.query(0.5), 0.0);
        assert_eq!(cdf.query_li(0.5), 0.0);
    }

    /// Serializes a populated sketch, restores it, and checks that the
    /// structural fields, the packed items and a set of quantile answers are
    /// unchanged.
    #[test]
    fn kll_round_trip_rmp() {
        let mut sketch = KLL::init_kll(256);
        for value in sample_uniform_f64(0.0, 1_000_000.0, 5_000, 0xDEAD_BEEF) {
            sketch.update_data_input(&DataInput::F64(value)).unwrap();
        }

        let bytes = sketch.serialize_to_bytes().expect("serialize KLL with rmp");
        assert!(!bytes.is_empty(), "serialized bytes should not be empty");
        let restored = KLL::deserialize_from_bytes(&bytes).expect("deserialize KLL with rmp");

        // Structural fields must survive unchanged.
        assert_eq!(sketch.k, restored.k);
        assert_eq!(sketch.m, restored.m);
        assert_eq!(sketch.num_levels, restored.num_levels);
        assert_eq!(sketch.top_height, restored.top_height);
        assert_eq!(sketch.level0_capacity, restored.level0_capacity);
        assert_eq!(
            sketch.levels, restored.levels,
            "level boundaries changed after round-trip"
        );

        // Compare only the item range delimited by the first and last level boundary.
        let original_span = sketch.levels[0]..sketch.levels[sketch.num_levels];
        let restored_span = restored.levels[0]..restored.levels[restored.num_levels];
        assert_eq!(
            &sketch.items[original_span],
            &restored.items[restored_span],
            "packed items changed after round-trip"
        );

        let original_cdf = sketch.cdf();
        let restored_cdf = restored.cdf();
        for q in [0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0] {
            let before = original_cdf.query(q);
            let after = restored_cdf.query(q);
            assert!(
                (before - after).abs() < f64::EPSILON,
                "quantile mismatch at p={q}: original={}, restored={}",
                before,
                after
            );
        }
    }

    /// Smoke test for `KLL<i64>`: runs update, count, cdf, merge and a
    /// serialization round-trip on the non-default element type. The bounds
    /// are deliberately loose; this is not an accuracy test.
    #[test]
    fn generic_kll_i64_sanity() {
        let n: i64 = 20_000;
        let n_f = n as f64;
        let tol = n_f * 0.02;

        let mut sketch = KLL::<i64>::init_kll(200);
        for v in 1..=n {
            sketch.update(&v);
        }

        // The reported count only has to be within 5% of the number of inserts.
        let count = sketch.count() as f64;
        assert!(
            (count - n_f).abs() / n_f < 0.05,
            "count={count} diverged from n={n}"
        );

        // Inputs are 1..=n, so the q-quantile should land near q * n.
        let cdf = sketch.cdf();
        let p50 = cdf.query(0.5);
        let p90 = cdf.query(0.9);
        assert!(
            (p50 - n_f * 0.5).abs() < tol,
            "p50={p50} out of range for n={n}"
        );
        assert!(
            (p90 - n_f * 0.9).abs() < tol,
            "p90={p90} out of range for n={n}"
        );

        // Split the same stream by parity across two sketches, then merge.
        let mut a = KLL::<i64>::init_kll(200);
        let mut b = KLL::<i64>::init_kll(200);
        for v in 1..=n {
            let target = if v % 2 == 0 { &mut a } else { &mut b };
            target.update(&v);
        }
        a.merge(&b);
        let merged_p50 = a.cdf().query(0.5);
        assert!(
            (merged_p50 - n_f * 0.5).abs() < tol,
            "merged p50={merged_p50} out of range"
        );

        // The merged sketch must report the same count after a round-trip.
        let bytes = a.serialize_to_bytes().expect("serialize KLL<i64>");
        let restored = KLL::<i64>::deserialize_from_bytes(&bytes).expect("deserialize KLL<i64>");
        assert_eq!(a.count(), restored.count());
    }
}