velesdb_core/collection/stats/
mod.rs1#![allow(clippy::cast_precision_loss)]
20#![allow(clippy::cast_possible_truncation)]
21
22use crate::collection::query_cost::cost_model::OperationCostFactors;
23use serde::{Deserialize, Serialize};
24use std::collections::HashMap;
25
26mod histogram;
27pub(crate) use histogram::next_after;
28pub(crate) use histogram::HistogramBuilder;
29pub use histogram::{Histogram, HistogramBucket};
30
31#[cfg(test)]
32mod tests;
33
34#[derive(Debug, Clone, Default, Serialize, Deserialize)]
36pub struct CollectionStats {
37 pub total_points: u64,
39 pub payload_size_bytes: u64,
41 pub field_stats: HashMap<String, ColumnStats>,
43 pub row_count: u64,
45 pub deleted_count: u64,
47 pub avg_row_size_bytes: u64,
49 pub total_size_bytes: u64,
51 pub column_stats: HashMap<String, ColumnStats>,
53 pub index_stats: HashMap<String, IndexStats>,
55 pub last_analyzed_epoch_ms: Option<u64>,
57 #[serde(default, skip_serializing_if = "Option::is_none")]
62 pub calibrated_cost_factors: Option<OperationCostFactors>,
63}
64
65impl CollectionStats {
66 #[must_use]
68 pub fn new() -> Self {
69 Self::default()
70 }
71
72 #[must_use]
74 pub fn with_counts(row_count: u64, deleted_count: u64) -> Self {
75 Self {
76 total_points: row_count,
77 row_count,
78 deleted_count,
79 ..Default::default()
80 }
81 }
82
83 #[must_use]
85 pub fn live_row_count(&self) -> u64 {
86 self.row_count.saturating_sub(self.deleted_count)
87 }
88
89 #[must_use]
91 pub fn deletion_ratio(&self) -> f64 {
92 if self.row_count == 0 {
93 0.0
94 } else {
95 self.deleted_count as f64 / self.row_count as f64
96 }
97 }
98
99 #[must_use]
101 pub fn estimate_selectivity(&self, column: &str) -> f64 {
102 if let Some(col_stats) = self.field_stats.get(column) {
103 if col_stats.distinct_values > 0 && self.total_points > 0 {
104 return 1.0 / col_stats.distinct_values as f64;
105 }
106 }
107 if let Some(col_stats) = self.column_stats.get(column) {
108 if col_stats.distinct_count > 0 && self.row_count > 0 {
109 return 1.0 / col_stats.distinct_count as f64;
110 }
111 }
112 0.1
114 }
115
116 #[must_use]
121 pub fn get_column_histogram(&self, column: &str) -> Option<&Histogram> {
122 self.column_stats
123 .get(column)
124 .or_else(|| self.field_stats.get(column))
125 .and_then(|cs| cs.histogram.as_ref())
126 .filter(|h| !h.buckets.is_empty())
127 }
128
129 pub fn mark_analyzed(&mut self) {
131 self.last_analyzed_epoch_ms = Some(
132 std::time::SystemTime::now()
133 .duration_since(std::time::UNIX_EPOCH)
134 .map_or(0, |d| d.as_millis() as u64),
135 );
136 }
137}
138
139#[derive(Debug, Clone, Default, Serialize, Deserialize)]
141pub struct ColumnStats {
142 pub name: String,
144 pub null_count: u64,
146 pub distinct_count: u64,
148 pub distinct_values: u64,
150 pub min_value: Option<String>,
152 pub max_value: Option<String>,
154 pub avg_size_bytes: u64,
156 pub histogram: Option<Histogram>,
158}
159
160impl ColumnStats {
161 #[must_use]
163 pub fn new(name: impl Into<String>) -> Self {
164 Self {
165 name: name.into(),
166 ..Default::default()
167 }
168 }
169
170 #[must_use]
172 pub fn with_distinct_count(mut self, count: u64) -> Self {
173 self.distinct_count = count;
174 self.distinct_values = count;
175 self
176 }
177
178 #[must_use]
180 pub fn with_null_count(mut self, count: u64) -> Self {
181 self.null_count = count;
182 self
183 }
184}
185
186#[derive(Debug, Clone, Default, Serialize, Deserialize)]
188pub struct IndexStats {
189 pub name: String,
191 pub index_type: String,
193 pub entry_count: u64,
195 pub depth: u32,
197 pub size_bytes: u64,
199}
200
201impl IndexStats {
202 #[must_use]
204 pub fn new(name: impl Into<String>, index_type: impl Into<String>) -> Self {
205 Self {
206 name: name.into(),
207 index_type: index_type.into(),
208 ..Default::default()
209 }
210 }
211
212 #[must_use]
214 pub fn with_entry_count(mut self, count: u64) -> Self {
215 self.entry_count = count;
216 self
217 }
218
219 #[must_use]
221 pub fn with_depth(mut self, depth: u32) -> Self {
222 self.depth = depth;
223 self
224 }
225}
226
227#[derive(Debug, Default)]
229pub struct StatsCollector {
230 stats: CollectionStats,
231}
232
233impl StatsCollector {
234 #[must_use]
236 pub fn new() -> Self {
237 Self::default()
238 }
239
240 pub fn set_row_count(&mut self, count: u64) {
242 self.stats.row_count = count;
243 self.stats.total_points = count;
244 }
245
246 pub fn set_deleted_count(&mut self, count: u64) {
248 self.stats.deleted_count = count;
249 }
250
251 pub fn set_total_size(&mut self, size: u64) {
253 self.stats.total_size_bytes = size;
254 self.stats.payload_size_bytes = size;
255 }
256
257 pub fn add_column_stats(&mut self, stats: ColumnStats) {
259 self.stats
260 .column_stats
261 .insert(stats.name.clone(), stats.clone());
262 self.stats.field_stats.insert(stats.name.clone(), stats);
263 }
264
265 pub fn add_index_stats(&mut self, stats: IndexStats) {
267 self.stats.index_stats.insert(stats.name.clone(), stats);
268 }
269
270 pub fn build_histogram(&mut self, column_name: &str, values: &mut [f64], num_buckets: usize) {
276 let histogram = HistogramBuilder::new(num_buckets).build(values);
277 self.stats
278 .column_stats
279 .entry(column_name.to_owned())
280 .or_insert_with(|| ColumnStats::new(column_name))
281 .histogram = Some(histogram.clone());
282 self.stats
283 .field_stats
284 .entry(column_name.to_owned())
285 .or_insert_with(|| ColumnStats::new(column_name))
286 .histogram = Some(histogram);
287 }
288
289 #[must_use]
291 pub fn build(mut self) -> CollectionStats {
292 if let Some(avg) = self
294 .stats
295 .total_size_bytes
296 .checked_div(self.stats.row_count)
297 {
298 self.stats.avg_row_size_bytes = avg;
299 }
300
301 self.stats.mark_analyzed();
302 self.stats
303 }
304}