#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
use crate::collection::query_cost::cost_model::OperationCostFactors;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
mod histogram;
pub(crate) use histogram::next_after;
pub(crate) use histogram::HistogramBuilder;
pub use histogram::{Histogram, HistogramBucket};
#[cfg(test)]
mod tests;
/// Aggregate statistics for one collection: row counts, sizes, and
/// per-column / per-index statistics.
///
/// Several fields come in pairs (`total_points`/`row_count`,
/// `payload_size_bytes`/`total_size_bytes`, `field_stats`/`column_stats`)
/// that `StatsCollector` always writes together.
/// NOTE(review): the pairs look like legacy and current names for the same
/// data; confirm before removing either side.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CollectionStats {
/// Point count; `with_counts` and `StatsCollector::set_row_count` set it to
/// the same value as `row_count`.
pub total_points: u64,
/// Size in bytes; `StatsCollector::set_total_size` writes the same value
/// here and into `total_size_bytes`.
pub payload_size_bytes: u64,
/// Per-column statistics keyed by column name; consulted first by
/// `estimate_selectivity` (using `ColumnStats::distinct_values`).
pub field_stats: HashMap<String, ColumnStats>,
/// Total row count; `live_row_count` subtracts `deleted_count` from it.
pub row_count: u64,
/// Number of deleted rows, used by `live_row_count` and `deletion_ratio`.
pub deleted_count: u64,
/// Average row size in bytes; `StatsCollector::build` sets it to
/// `total_size_bytes / row_count` when `row_count` is non-zero.
pub avg_row_size_bytes: u64,
/// Total size in bytes (see `payload_size_bytes`).
pub total_size_bytes: u64,
/// Per-column statistics keyed by column name; consulted first by
/// `get_column_histogram` and second by `estimate_selectivity`
/// (using `ColumnStats::distinct_count`).
pub column_stats: HashMap<String, ColumnStats>,
/// Per-index statistics keyed by index name.
pub index_stats: HashMap<String, IndexStats>,
/// Milliseconds since the Unix epoch recorded by the last `mark_analyzed`
/// call; `None` until it has been called.
pub last_analyzed_epoch_ms: Option<u64>,
/// Optional cost factors. Omitted from serialized output when `None` and
/// defaulted to `None` when absent on deserialization.
/// NOTE(review): presumably overrides the default cost model; confirm
/// against the cost-model module.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub calibrated_cost_factors: Option<OperationCostFactors>,
}
impl CollectionStats {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn with_counts(row_count: u64, deleted_count: u64) -> Self {
Self {
total_points: row_count,
row_count,
deleted_count,
..Default::default()
}
}
#[must_use]
pub fn live_row_count(&self) -> u64 {
self.row_count.saturating_sub(self.deleted_count)
}
#[must_use]
pub fn deletion_ratio(&self) -> f64 {
if self.row_count == 0 {
0.0
} else {
self.deleted_count as f64 / self.row_count as f64
}
}
#[must_use]
pub fn estimate_selectivity(&self, column: &str) -> f64 {
if let Some(col_stats) = self.field_stats.get(column) {
if col_stats.distinct_values > 0 && self.total_points > 0 {
return 1.0 / col_stats.distinct_values as f64;
}
}
if let Some(col_stats) = self.column_stats.get(column) {
if col_stats.distinct_count > 0 && self.row_count > 0 {
return 1.0 / col_stats.distinct_count as f64;
}
}
0.1
}
#[must_use]
pub fn get_column_histogram(&self, column: &str) -> Option<&Histogram> {
self.column_stats
.get(column)
.or_else(|| self.field_stats.get(column))
.and_then(|cs| cs.histogram.as_ref())
.filter(|h| !h.buckets.is_empty())
}
pub fn mark_analyzed(&mut self) {
self.last_analyzed_epoch_ms = Some(
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map_or(0, |d| d.as_millis() as u64),
);
}
}
/// Statistics recorded for a single column.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ColumnStats {
/// Column name; `StatsCollector::add_column_stats` uses it as the map key.
pub name: String,
/// Number of null values (set via `with_null_count`).
pub null_count: u64,
/// Number of distinct values; read by `CollectionStats::estimate_selectivity`
/// for entries found in `column_stats`.
pub distinct_count: u64,
/// Number of distinct values; read by `CollectionStats::estimate_selectivity`
/// for entries found in `field_stats`. `with_distinct_count` sets this and
/// `distinct_count` to the same value.
pub distinct_values: u64,
/// Minimum value, stored as a string.
/// NOTE(review): string encoding/ordering is not visible here; confirm.
pub min_value: Option<String>,
/// Maximum value, stored as a string (same caveat as `min_value`).
pub max_value: Option<String>,
/// Average value size in bytes.
pub avg_size_bytes: u64,
/// Optional value histogram, populated by `StatsCollector::build_histogram`.
pub histogram: Option<Histogram>,
}
impl ColumnStats {
    /// Builds statistics for the column called `name`; every other field
    /// starts at its default value.
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        let name = name.into();
        Self {
            name,
            ..Self::default()
        }
    }

    /// Returns `self` with both `distinct_count` and `distinct_values` set
    /// to `count`.
    #[must_use]
    pub fn with_distinct_count(self, count: u64) -> Self {
        Self {
            distinct_count: count,
            distinct_values: count,
            ..self
        }
    }

    /// Returns `self` with `null_count` set to `count`.
    #[must_use]
    pub fn with_null_count(self, count: u64) -> Self {
        Self {
            null_count: count,
            ..self
        }
    }
}
/// Statistics recorded for a single index.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IndexStats {
/// Index name; `StatsCollector::add_index_stats` uses it as the map key.
pub name: String,
/// Index type, stored as a free-form string.
/// NOTE(review): the set of accepted values is not visible here; confirm.
pub index_type: String,
/// Number of entries in the index (set via `with_entry_count`).
pub entry_count: u64,
/// Index depth (set via `with_depth`).
/// NOTE(review): presumably tree height for tree-shaped indexes; confirm.
pub depth: u32,
/// Index size in bytes.
pub size_bytes: u64,
}
impl IndexStats {
    /// Builds statistics for the index `name` of kind `index_type`; the
    /// numeric fields start at zero.
    #[must_use]
    pub fn new(name: impl Into<String>, index_type: impl Into<String>) -> Self {
        let (name, index_type) = (name.into(), index_type.into());
        Self {
            name,
            index_type,
            ..Self::default()
        }
    }

    /// Returns `self` with `entry_count` set to `count`.
    #[must_use]
    pub fn with_entry_count(self, count: u64) -> Self {
        Self {
            entry_count: count,
            ..self
        }
    }

    /// Returns `self` with `depth` set to `depth`.
    #[must_use]
    pub fn with_depth(self, depth: u32) -> Self {
        Self { depth, ..self }
    }
}
/// Incrementally assembles a `CollectionStats` value; call `build` to
/// finalize and obtain the result.
#[derive(Debug, Default)]
pub struct StatsCollector {
// Statistics accumulated so far; handed out by `build`.
stats: CollectionStats,
}
impl StatsCollector {
#[must_use]
pub fn new() -> Self {
Self::default()
}
pub fn set_row_count(&mut self, count: u64) {
self.stats.row_count = count;
self.stats.total_points = count;
}
pub fn set_deleted_count(&mut self, count: u64) {
self.stats.deleted_count = count;
}
pub fn set_total_size(&mut self, size: u64) {
self.stats.total_size_bytes = size;
self.stats.payload_size_bytes = size;
}
pub fn add_column_stats(&mut self, stats: ColumnStats) {
self.stats
.column_stats
.insert(stats.name.clone(), stats.clone());
self.stats.field_stats.insert(stats.name.clone(), stats);
}
pub fn add_index_stats(&mut self, stats: IndexStats) {
self.stats.index_stats.insert(stats.name.clone(), stats);
}
pub fn build_histogram(&mut self, column_name: &str, values: &mut [f64], num_buckets: usize) {
let histogram = HistogramBuilder::new(num_buckets).build(values);
self.stats
.column_stats
.entry(column_name.to_owned())
.or_insert_with(|| ColumnStats::new(column_name))
.histogram = Some(histogram.clone());
self.stats
.field_stats
.entry(column_name.to_owned())
.or_insert_with(|| ColumnStats::new(column_name))
.histogram = Some(histogram);
}
#[must_use]
pub fn build(mut self) -> CollectionStats {
if let Some(avg) = self
.stats
.total_size_bytes
.checked_div(self.stats.row_count)
{
self.stats.avg_row_size_bytes = avg;
}
self.stats.mark_analyzed();
self.stats
}
}