Skip to main content

velesdb_core/collection/stats/
mod.rs

1//! Collection statistics module for query planning.
2//!
3//! This module provides statistics collection and caching for collections,
4//! enabling cost-based query planning and optimization.
5//!
6//! # EPIC-046 US-001: Collection Statistics
7//!
8//! Implements collection-level statistics including:
9//! - Row count and deleted count
10//! - Column cardinality (distinct values)
11//! - Index statistics (depth, entry count)
12//! - Size metrics (avg row size, total size)
13
14// SAFETY: Numeric casts in statistics are intentional:
15// - All casts are for computing collection metrics and estimates
16// - f64/usize conversions for cardinality ratios and averages
17// - Values bounded by collection size and column cardinality
18// - Precision loss acceptable for statistics (approximate by design)
19#![allow(clippy::cast_precision_loss)]
20#![allow(clippy::cast_possible_truncation)]
21
22use serde::{Deserialize, Serialize};
23use std::collections::HashMap;
24
25#[cfg(test)]
26mod tests;
27
28/// Statistics for a collection.
///
/// The struct carries two parallel groups of fields: `total_points`,
/// `payload_size_bytes` and `field_stats` next to `row_count`,
/// `total_size_bytes` and `column_stats`. `StatsCollector` writes each pair
/// with the same value, and `estimate_selectivity` consults `field_stats`
/// before falling back to `column_stats`.
29#[derive(Debug, Clone, Default, Serialize, Deserialize)]
30pub struct CollectionStats {
31    /// Total number of points in the collection.
    ///
    /// `with_counts` and `StatsCollector::set_row_count` store the same value
    /// here as in `row_count`.
32    pub total_points: u64,
33    /// Total payload storage footprint in bytes.
    ///
    /// `StatsCollector::set_total_size` stores the same value here as in
    /// `total_size_bytes`.
34    pub payload_size_bytes: u64,
35    /// Per-field statistics for cost-based planning, keyed by column name.
36    pub field_stats: HashMap<String, ColumnStats>,
37    /// Number of active rows.
    ///
    /// NOTE(review): `live_row_count` subtracts `deleted_count` from this
    /// value, which suggests tombstoned rows are included here; confirm the
    /// intended meaning of "active".
38    pub row_count: u64,
39    /// Number of deleted/tombstoned rows.
40    pub deleted_count: u64,
41    /// Average row size in bytes.
    ///
    /// `StatsCollector::build` fills this with `total_size_bytes / row_count`
    /// (integer division) when `row_count` is non-zero.
42    pub avg_row_size_bytes: u64,
43    /// Total collection size in bytes.
44    pub total_size_bytes: u64,
45    /// Statistics per column, keyed by column name.
46    pub column_stats: HashMap<String, ColumnStats>,
47    /// Statistics per index, keyed by index name.
48    pub index_stats: HashMap<String, IndexStats>,
49    /// Timestamp of the last ANALYZE in milliseconds since the Unix epoch.
    ///
    /// `None` by default; `mark_analyzed` sets it.
50    pub last_analyzed_epoch_ms: Option<u64>,
51}
52
53impl CollectionStats {
54    /// Creates empty statistics
55    #[must_use]
56    pub fn new() -> Self {
57        Self::default()
58    }
59
60    /// Creates statistics with basic counts
61    #[must_use]
62    pub fn with_counts(row_count: u64, deleted_count: u64) -> Self {
63        Self {
64            total_points: row_count,
65            row_count,
66            deleted_count,
67            ..Default::default()
68        }
69    }
70
71    /// Returns the live row count (excluding deleted)
72    #[must_use]
73    pub fn live_row_count(&self) -> u64 {
74        self.row_count.saturating_sub(self.deleted_count)
75    }
76
77    /// Returns the deletion ratio (0.0-1.0)
78    #[must_use]
79    pub fn deletion_ratio(&self) -> f64 {
80        if self.row_count == 0 {
81            0.0
82        } else {
83            self.deleted_count as f64 / self.row_count as f64
84        }
85    }
86
87    /// Estimates selectivity for a column based on cardinality
88    #[must_use]
89    pub fn estimate_selectivity(&self, column: &str) -> f64 {
90        if let Some(col_stats) = self.field_stats.get(column) {
91            if col_stats.distinct_values > 0 && self.total_points > 0 {
92                return 1.0 / col_stats.distinct_values as f64;
93            }
94        }
95        if let Some(col_stats) = self.column_stats.get(column) {
96            if col_stats.distinct_count > 0 && self.row_count > 0 {
97                return 1.0 / col_stats.distinct_count as f64;
98            }
99        }
100        // Default: assume 10% selectivity if unknown
101        0.1
102    }
103
104    /// Sets the last analyzed timestamp to now
105    pub fn mark_analyzed(&mut self) {
106        self.last_analyzed_epoch_ms = Some(
107            std::time::SystemTime::now()
108                .duration_since(std::time::UNIX_EPOCH)
109                .map(|d| d.as_millis() as u64)
110                .unwrap_or(0),
111        );
112    }
113}
114
115/// Statistics for a single column.
116#[derive(Debug, Clone, Default, Serialize, Deserialize)]
117pub struct ColumnStats {
118    /// Column name; `StatsCollector::add_column_stats` uses it as the map key.
119    pub name: String,
120    /// Number of null values.
121    pub null_count: u64,
122    /// Number of distinct values (cardinality).
    ///
    /// `CollectionStats::estimate_selectivity` reads this field through
    /// `column_stats`.
123    pub distinct_count: u64,
124    /// Number of distinct values (CBO alias).
    ///
    /// `with_distinct_count` keeps it equal to `distinct_count`;
    /// `CollectionStats::estimate_selectivity` reads it through `field_stats`.
125    pub distinct_values: u64,
126    /// Minimum value (serialized), if known.
127    pub min_value: Option<String>,
128    /// Maximum value (serialized), if known.
129    pub max_value: Option<String>,
130    /// Average value size in bytes.
131    pub avg_size_bytes: u64,
132    /// Optional histogram for selectivity estimates; `None` by default.
133    pub histogram: Option<Histogram>,
134}
135
136/// Histogram bucket storing approximate distribution details.
///
/// A bucket covers the half-open interval `[lower_bound, upper_bound)`.
137#[derive(Debug, Clone, Default, Serialize, Deserialize)]
138pub struct HistogramBucket {
139    /// Inclusive lower bound for the bucket.
140    pub lower_bound: f64,
141    /// Exclusive upper bound for the bucket.
142    pub upper_bound: f64,
143    /// Number of sampled rows that fell into the bucket.
144    pub count: u64,
145}
146
147/// Simple histogram used by the CBO (presumably the cost-based optimizer).
///
/// The default value has no buckets.
148#[derive(Debug, Clone, Default, Serialize, Deserialize)]
149pub struct Histogram {
150    /// Ordered list of histogram buckets.
151    pub buckets: Vec<HistogramBucket>,
152}
153
154impl ColumnStats {
155    /// Creates new column stats
156    #[must_use]
157    pub fn new(name: impl Into<String>) -> Self {
158        Self {
159            name: name.into(),
160            ..Default::default()
161        }
162    }
163
164    /// Sets cardinality
165    #[must_use]
166    pub fn with_distinct_count(mut self, count: u64) -> Self {
167        self.distinct_count = count;
168        self.distinct_values = count;
169        self
170    }
171
172    /// Sets null count
173    #[must_use]
174    pub fn with_null_count(mut self, count: u64) -> Self {
175        self.null_count = count;
176        self
177    }
178}
179
180/// Statistics for an index.
181#[derive(Debug, Clone, Default, Serialize, Deserialize)]
182pub struct IndexStats {
183    /// Index name; `StatsCollector::add_index_stats` uses it as the map key.
184    pub name: String,
185    /// Index type (HNSW, PropertyIndex, etc.), stored as free-form text.
186    pub index_type: String,
187    /// Number of entries in the index.
188    pub entry_count: u64,
189    /// Index depth (for tree-based indexes); defaults to 0.
190    pub depth: u32,
191    /// Index size in bytes.
192    pub size_bytes: u64,
193}
194
195impl IndexStats {
196    /// Creates new index stats
197    #[must_use]
198    pub fn new(name: impl Into<String>, index_type: impl Into<String>) -> Self {
199        Self {
200            name: name.into(),
201            index_type: index_type.into(),
202            ..Default::default()
203        }
204    }
205
206    /// Sets entry count
207    #[must_use]
208    pub fn with_entry_count(mut self, count: u64) -> Self {
209        self.entry_count = count;
210        self
211    }
212
213    /// Sets depth
214    #[must_use]
215    pub fn with_depth(mut self, depth: u32) -> Self {
216        self.depth = depth;
217        self
218    }
219}
220
221/// Statistics collector for building CollectionStats.
///
/// Values are accumulated through the `set_*` and `add_*` methods; `build`
/// consumes the collector and returns the finished `CollectionStats`.
222#[derive(Debug, Default)]
223pub struct StatsCollector {
    /// Statistics being assembled; returned by value from `build`.
224    stats: CollectionStats,
225}
226
227impl StatsCollector {
228    /// Creates a new collector
229    #[must_use]
230    pub fn new() -> Self {
231        Self::default()
232    }
233
234    /// Sets row count
235    pub fn set_row_count(&mut self, count: u64) {
236        self.stats.row_count = count;
237        self.stats.total_points = count;
238    }
239
240    /// Sets deleted count
241    pub fn set_deleted_count(&mut self, count: u64) {
242        self.stats.deleted_count = count;
243    }
244
245    /// Sets total size
246    pub fn set_total_size(&mut self, size: u64) {
247        self.stats.total_size_bytes = size;
248        self.stats.payload_size_bytes = size;
249    }
250
251    /// Adds column statistics
252    pub fn add_column_stats(&mut self, stats: ColumnStats) {
253        self.stats
254            .column_stats
255            .insert(stats.name.clone(), stats.clone());
256        self.stats.field_stats.insert(stats.name.clone(), stats);
257    }
258
259    /// Adds index statistics
260    pub fn add_index_stats(&mut self, stats: IndexStats) {
261        self.stats.index_stats.insert(stats.name.clone(), stats);
262    }
263
264    /// Builds the final CollectionStats
265    #[must_use]
266    pub fn build(mut self) -> CollectionStats {
267        // Calculate average row size
268        if self.stats.row_count > 0 {
269            self.stats.avg_row_size_bytes = self.stats.total_size_bytes / self.stats.row_count;
270        }
271
272        self.stats.mark_analyzed();
273        self.stats
274    }
275}