Skip to main content

velesdb_core/collection/stats/
mod.rs

//! Collection statistics module for query planning.
//!
//! This module provides statistics collection and caching for collections,
//! enabling cost-based query planning and optimization.
//!
//! # EPIC-046 US-001: Collection Statistics
//!
//! Implements collection-level statistics including:
//! - Row count and deleted count
//! - Column cardinality (distinct values)
//! - Index statistics (depth, entry count)
//! - Size metrics (avg row size, total size)
13
14// Reason: Numeric casts in statistics are intentional:
15// - All casts are for computing collection metrics and estimates
16// - f64/usize conversions for cardinality ratios and averages
17// - Values bounded by collection size and column cardinality
18// - Precision loss acceptable for statistics (approximate by design)
19#![allow(clippy::cast_precision_loss)]
20#![allow(clippy::cast_possible_truncation)]
21
22use crate::collection::query_cost::cost_model::OperationCostFactors;
23use serde::{Deserialize, Serialize};
24use std::collections::HashMap;
25
26mod histogram;
27pub(crate) use histogram::next_after;
28pub(crate) use histogram::HistogramBuilder;
29pub use histogram::{Histogram, HistogramBucket};
30
31#[cfg(test)]
32mod tests;
33
/// Collection-level statistics used for cost-based query planning.
///
/// Several fields come in pairs (`total_points`/`row_count`,
/// `payload_size_bytes`/`total_size_bytes`, `field_stats`/`column_stats`).
/// `StatsCollector` writes both members of each pair together, but the
/// fields are public and can be set independently.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CollectionStats {
    /// Total number of points in the collection.
    ///
    /// `with_counts` and `StatsCollector::set_row_count` set this to the same
    /// value as `row_count`.
    pub total_points: u64,
    /// Total payload storage footprint in bytes.
    ///
    /// `StatsCollector::set_total_size` sets this to the same value as
    /// `total_size_bytes`.
    pub payload_size_bytes: u64,
    /// Per-field statistics for cost-based planning, keyed by column name.
    ///
    /// `estimate_selectivity` reads `distinct_values` from entries in this map.
    pub field_stats: HashMap<String, ColumnStats>,
    /// Number of active rows.
    ///
    /// NOTE(review): `live_row_count` subtracts `deleted_count` from this
    /// value, which suggests it also counts tombstoned rows — confirm.
    pub row_count: u64,
    /// Number of deleted/tombstoned rows.
    pub deleted_count: u64,
    /// Average row size in bytes.
    ///
    /// `StatsCollector::build` computes this as `total_size_bytes / row_count`
    /// (integer division) when `row_count` is non-zero.
    pub avg_row_size_bytes: u64,
    /// Total collection size in bytes.
    pub total_size_bytes: u64,
    /// Statistics per column, keyed by column name.
    ///
    /// `estimate_selectivity` reads `distinct_count` from entries in this map.
    pub column_stats: HashMap<String, ColumnStats>,
    /// Statistics per index, keyed by index name.
    pub index_stats: HashMap<String, IndexStats>,
    /// Timestamp of the last ANALYZE, in milliseconds since the Unix epoch.
    ///
    /// `None` until `mark_analyzed` has been called.
    pub last_analyzed_epoch_ms: Option<u64>,
    /// Calibrated cost factors derived from collection statistics.
    ///
    /// `None` if the collection has never been analyzed or stats are invalid.
    /// Persisted in `collection.stats.json` to survive restarts.
    /// Omitted from the serialized form when `None`, and defaulted to `None`
    /// when missing on deserialization.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub calibrated_cost_factors: Option<OperationCostFactors>,
}
64
65impl CollectionStats {
66    /// Creates empty statistics
67    #[must_use]
68    pub fn new() -> Self {
69        Self::default()
70    }
71
72    /// Creates statistics with basic counts
73    #[must_use]
74    pub fn with_counts(row_count: u64, deleted_count: u64) -> Self {
75        Self {
76            total_points: row_count,
77            row_count,
78            deleted_count,
79            ..Default::default()
80        }
81    }
82
83    /// Returns the live row count (excluding deleted)
84    #[must_use]
85    pub fn live_row_count(&self) -> u64 {
86        self.row_count.saturating_sub(self.deleted_count)
87    }
88
89    /// Returns the deletion ratio (0.0-1.0)
90    #[must_use]
91    pub fn deletion_ratio(&self) -> f64 {
92        if self.row_count == 0 {
93            0.0
94        } else {
95            self.deleted_count as f64 / self.row_count as f64
96        }
97    }
98
99    /// Estimates selectivity for a column based on cardinality
100    #[must_use]
101    pub fn estimate_selectivity(&self, column: &str) -> f64 {
102        if let Some(col_stats) = self.field_stats.get(column) {
103            if col_stats.distinct_values > 0 && self.total_points > 0 {
104                return 1.0 / col_stats.distinct_values as f64;
105            }
106        }
107        if let Some(col_stats) = self.column_stats.get(column) {
108            if col_stats.distinct_count > 0 && self.row_count > 0 {
109                return 1.0 / col_stats.distinct_count as f64;
110            }
111        }
112        // Default: assume 10% selectivity if unknown
113        0.1
114    }
115
116    /// Returns the histogram for a column, checking both `column_stats` and `field_stats`.
117    ///
118    /// Returns `None` when neither map contains the column or the histogram is
119    /// absent / empty.
120    #[must_use]
121    pub fn get_column_histogram(&self, column: &str) -> Option<&Histogram> {
122        self.column_stats
123            .get(column)
124            .or_else(|| self.field_stats.get(column))
125            .and_then(|cs| cs.histogram.as_ref())
126            .filter(|h| !h.buckets.is_empty())
127    }
128
129    /// Sets the last analyzed timestamp to now
130    pub fn mark_analyzed(&mut self) {
131        self.last_analyzed_epoch_ms = Some(
132            std::time::SystemTime::now()
133                .duration_since(std::time::UNIX_EPOCH)
134                .map_or(0, |d| d.as_millis() as u64),
135        );
136    }
137}
138
/// Statistics for a single column.
///
/// All numeric fields default to zero and all optional fields to `None`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ColumnStats {
    /// Column name.
    pub name: String,
    /// Number of null values.
    pub null_count: u64,
    /// Number of distinct values (cardinality).
    ///
    /// `with_distinct_count` sets this together with `distinct_values`.
    pub distinct_count: u64,
    /// Number of distinct values (CBO alias).
    ///
    /// `with_distinct_count` sets this together with `distinct_count`.
    pub distinct_values: u64,
    /// Minimum value (serialized).
    pub min_value: Option<String>,
    /// Maximum value (serialized).
    pub max_value: Option<String>,
    /// Average value size in bytes.
    pub avg_size_bytes: u64,
    /// Optional histogram for selectivity estimates.
    pub histogram: Option<Histogram>,
}
159
160impl ColumnStats {
161    /// Creates new column stats
162    #[must_use]
163    pub fn new(name: impl Into<String>) -> Self {
164        Self {
165            name: name.into(),
166            ..Default::default()
167        }
168    }
169
170    /// Sets cardinality
171    #[must_use]
172    pub fn with_distinct_count(mut self, count: u64) -> Self {
173        self.distinct_count = count;
174        self.distinct_values = count;
175        self
176    }
177
178    /// Sets null count
179    #[must_use]
180    pub fn with_null_count(mut self, count: u64) -> Self {
181        self.null_count = count;
182        self
183    }
184}
185
/// Statistics for an index.
///
/// `IndexStats::new` fills in `name` and `index_type`; the numeric fields
/// default to zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IndexStats {
    /// Index name.
    pub name: String,
    /// Index type (HNSW, PropertyIndex, etc.), stored as free-form text.
    pub index_type: String,
    /// Number of entries in the index.
    pub entry_count: u64,
    /// Index depth (for tree-based indexes).
    pub depth: u32,
    /// Index size in bytes.
    pub size_bytes: u64,
}
200
201impl IndexStats {
202    /// Creates new index stats
203    #[must_use]
204    pub fn new(name: impl Into<String>, index_type: impl Into<String>) -> Self {
205        Self {
206            name: name.into(),
207            index_type: index_type.into(),
208            ..Default::default()
209        }
210    }
211
212    /// Sets entry count
213    #[must_use]
214    pub fn with_entry_count(mut self, count: u64) -> Self {
215        self.entry_count = count;
216        self
217    }
218
219    /// Sets depth
220    #[must_use]
221    pub fn with_depth(mut self, depth: u32) -> Self {
222        self.depth = depth;
223        self
224    }
225}
226
/// Statistics collector for building `CollectionStats`.
///
/// Setter and `add_*` methods accumulate values into an internal
/// `CollectionStats`; `build` derives the average row size, stamps the
/// analysis time and returns the finished statistics.
#[derive(Debug, Default)]
pub struct StatsCollector {
    /// Statistics accumulated so far.
    stats: CollectionStats,
}
232
233impl StatsCollector {
234    /// Creates a new collector
235    #[must_use]
236    pub fn new() -> Self {
237        Self::default()
238    }
239
240    /// Sets row count
241    pub fn set_row_count(&mut self, count: u64) {
242        self.stats.row_count = count;
243        self.stats.total_points = count;
244    }
245
246    /// Sets deleted count
247    pub fn set_deleted_count(&mut self, count: u64) {
248        self.stats.deleted_count = count;
249    }
250
251    /// Sets total size
252    pub fn set_total_size(&mut self, size: u64) {
253        self.stats.total_size_bytes = size;
254        self.stats.payload_size_bytes = size;
255    }
256
257    /// Adds column statistics
258    pub fn add_column_stats(&mut self, stats: ColumnStats) {
259        self.stats
260            .column_stats
261            .insert(stats.name.clone(), stats.clone());
262        self.stats.field_stats.insert(stats.name.clone(), stats);
263    }
264
265    /// Adds index statistics
266    pub fn add_index_stats(&mut self, stats: IndexStats) {
267        self.stats.index_stats.insert(stats.name.clone(), stats);
268    }
269
270    /// Builds a histogram for a column from sampled values and stores it.
271    ///
272    /// Called by `Collection::analyze()` for each Int, Float, and String column.
273    /// Uses `HistogramBuilder` to construct an equi-depth histogram, then attaches
274    /// it to the corresponding `ColumnStats` entry (creating one if absent).
275    pub fn build_histogram(&mut self, column_name: &str, values: &mut [f64], num_buckets: usize) {
276        let histogram = HistogramBuilder::new(num_buckets).build(values);
277        self.stats
278            .column_stats
279            .entry(column_name.to_owned())
280            .or_insert_with(|| ColumnStats::new(column_name))
281            .histogram = Some(histogram.clone());
282        self.stats
283            .field_stats
284            .entry(column_name.to_owned())
285            .or_insert_with(|| ColumnStats::new(column_name))
286            .histogram = Some(histogram);
287    }
288
289    /// Builds the final CollectionStats
290    #[must_use]
291    pub fn build(mut self) -> CollectionStats {
292        // Calculate average row size
293        if let Some(avg) = self
294            .stats
295            .total_size_bytes
296            .checked_div(self.stats.row_count)
297        {
298            self.stats.avg_row_size_bytes = avg;
299        }
300
301        self.stats.mark_analyzed();
302        self.stats
303    }
304}