velesdb_core/collection/core/
statistics.rs1#![allow(
2 clippy::cast_precision_loss,
3 clippy::cast_possible_truncation,
4 clippy::cast_sign_loss,
5 clippy::cast_possible_wrap,
6 clippy::float_cmp,
7 clippy::approx_constant
8)]
9use crate::collection::stats::{CollectionStats, IndexStats, StatsCollector};
15use crate::collection::Collection;
16use crate::error::Error;
17use crate::storage::PayloadStorage;
18use std::collections::{HashMap, HashSet};
19
20impl Collection {
21 pub fn analyze(&self) -> Result<CollectionStats, Error> {
43 let mut collector = StatsCollector::new();
44
45 let config = self.config.read();
49 collector.set_row_count(u64::try_from(config.point_count).unwrap_or(u64::MAX));
51 drop(config);
52
53 let mut distinct_values: HashMap<String, HashSet<String>> = HashMap::new();
54 let mut null_counts: HashMap<String, u64> = HashMap::new();
55 let mut payload_size_bytes = 0u64;
56
57 let payload_storage = self.payload_storage.read();
58 let ids = payload_storage.ids();
59 for id in ids.into_iter().take(1_000) {
60 if let Ok(Some(payload)) = payload_storage.retrieve(id) {
61 if let Ok(payload_bytes) = serde_json::to_vec(&payload) {
62 payload_size_bytes = payload_size_bytes
63 .saturating_add(u64::try_from(payload_bytes.len()).unwrap_or(u64::MAX));
64 }
65
66 if let Some(obj) = payload.as_object() {
67 for (key, value) in obj {
68 if value.is_null() {
69 *null_counts.entry(key.clone()).or_insert(0) += 1;
70 } else {
71 distinct_values
72 .entry(key.clone())
73 .or_default()
74 .insert(value.to_string());
75 }
76 }
77 }
78 }
79 }
80 drop(payload_storage);
81
82 collector.set_total_size(payload_size_bytes);
83
84 for (field, values) in distinct_values {
85 let mut col = crate::collection::stats::ColumnStats::new(field.clone())
86 .with_distinct_count(u64::try_from(values.len()).unwrap_or(u64::MAX));
87 if let Some(null_count) = null_counts.get(&field) {
88 col = col.with_null_count(*null_count);
89 }
90 collector.add_column_stats(col);
91 }
92
93 let hnsw_len = self.index.len();
95 let hnsw_stats = IndexStats::new("hnsw_primary", "HNSW")
96 .with_entry_count(u64::try_from(hnsw_len).unwrap_or(u64::MAX));
97 collector.add_index_stats(hnsw_stats);
98
99 let bm25_len = self.text_index.len();
101 if bm25_len > 0 {
102 let bm25_stats = IndexStats::new("bm25_text", "BM25")
103 .with_entry_count(u64::try_from(bm25_len).unwrap_or(u64::MAX));
104 collector.add_index_stats(bm25_stats);
105 }
106
107 Ok(collector.build())
108 }
109
110 #[must_use]
119 pub fn get_stats(&self) -> CollectionStats {
120 match self.analyze() {
124 Ok(stats) => stats,
125 Err(e) => {
126 tracing::warn!(
127 "Failed to compute collection statistics: {}. Returning defaults.",
128 e
129 );
130 CollectionStats::default()
131 }
132 }
133 }
134
135 #[must_use]
140 pub fn estimate_column_selectivity(&self, column: &str) -> f64 {
141 let stats = self.get_stats();
142 stats.estimate_selectivity(column)
143 }
144}
145
#[cfg(test)]
mod tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::point::Point;
    use tempfile::TempDir;

    /// A freshly created collection reports no rows, no deletions, and still
    /// exposes the primary HNSW index entry.
    #[test]
    fn test_analyze_empty_collection() {
        // The temp dir must stay alive for as long as the collection is used.
        let workspace = TempDir::new().unwrap();
        let root = workspace.path().to_path_buf();
        let empty = Collection::create(root, 128, DistanceMetric::Cosine).unwrap();

        let snapshot = empty.analyze().unwrap();

        assert_eq!(snapshot.row_count, 0);
        assert_eq!(snapshot.deleted_count, 0);
        assert!(snapshot.index_stats.contains_key("hnsw_primary"));
    }

    /// After upserting ten points, the row count is ten and the HNSW index
    /// reports at least ten entries.
    #[test]
    fn test_analyze_with_data() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path().to_path_buf();
        let populated = Collection::create(root, 4, DistanceMetric::Cosine).unwrap();

        // Ten 4-dimensional points whose payload cycles through three categories.
        let mut batch: Vec<Point> = Vec::new();
        for i in 0..10 {
            let payload = serde_json::json!({"category": format!("cat_{}", i % 3)});
            batch.push(Point::new(i, vec![i as f32; 4], Some(payload)));
        }
        populated.upsert(batch).unwrap();

        let snapshot = populated.analyze().unwrap();

        assert_eq!(snapshot.row_count, 10);
        let hnsw = snapshot.index_stats.get("hnsw_primary").unwrap();
        assert!(hnsw.entry_count >= 10);
    }

    /// `get_stats` on a fresh collection yields zero live rows.
    ///
    /// NOTE(review): despite the name, no failure is injected here, so the
    /// default-on-error fallback itself is not exercised by this test.
    #[test]
    fn test_get_stats_returns_defaults_on_error() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path().to_path_buf();
        let fresh = Collection::create(root, 128, DistanceMetric::Cosine).unwrap();

        let snapshot = fresh.get_stats();

        assert_eq!(snapshot.live_row_count(), 0);
    }
}