Skip to main content

velesdb_core/collection/core/
statistics.rs

1#![allow(
2    clippy::cast_precision_loss,
3    clippy::cast_possible_truncation,
4    clippy::cast_sign_loss,
5    clippy::cast_possible_wrap,
6    clippy::float_cmp,
7    clippy::approx_constant
8)]
9//! Collection statistics methods (EPIC-046 US-001).
10//!
11//! Provides the `analyze()` method for collecting runtime statistics
12//! to support cost-based query planning.
13
14use crate::collection::stats::{CollectionStats, IndexStats, StatsCollector};
15use crate::collection::Collection;
16use crate::error::Error;
17use crate::storage::PayloadStorage;
18use std::collections::{HashMap, HashSet};
19
20impl Collection {
21    /// Analyzes the collection and returns statistics.
22    ///
23    /// This method collects:
24    /// - Row count and deleted count
25    /// - Index statistics (HNSW entry count)
26    ///
27    /// # Example
28    ///
29    /// ```ignore
30    /// let stats = collection.analyze()?;
31    /// println!("Row count: {}", stats.row_count);
32    /// println!("Deletion ratio: {:.1}%", stats.deletion_ratio() * 100.0);
33    /// ```
34    ///
35    /// # Errors
36    ///
37    /// Returns an error if statistics cannot be collected.
38    ///
39    /// # Panics
40    ///
41    /// Panics if `point_count` exceeds `u64::MAX` (extremely unlikely on 64-bit systems).
42    pub fn analyze(&self) -> Result<CollectionStats, Error> {
43        let mut collector = StatsCollector::new();
44
45        // Basic counts from config
46        // Note: deleted_count and column_stats are placeholders for future tombstone tracking
47        // and per-column cardinality analysis (EPIC-046 future work)
48        let config = self.config.read();
49        // Reason: Collection sizes are bounded by available memory, always < u64::MAX on 64-bit systems
50        collector.set_row_count(u64::try_from(config.point_count).unwrap_or(u64::MAX));
51        drop(config);
52
53        let mut distinct_values: HashMap<String, HashSet<String>> = HashMap::new();
54        let mut null_counts: HashMap<String, u64> = HashMap::new();
55        let mut payload_size_bytes = 0u64;
56
57        let payload_storage = self.payload_storage.read();
58        let ids = payload_storage.ids();
59        for id in ids.into_iter().take(1_000) {
60            if let Ok(Some(payload)) = payload_storage.retrieve(id) {
61                if let Ok(payload_bytes) = serde_json::to_vec(&payload) {
62                    payload_size_bytes = payload_size_bytes
63                        .saturating_add(u64::try_from(payload_bytes.len()).unwrap_or(u64::MAX));
64                }
65
66                if let Some(obj) = payload.as_object() {
67                    for (key, value) in obj {
68                        if value.is_null() {
69                            *null_counts.entry(key.clone()).or_insert(0) += 1;
70                        } else {
71                            distinct_values
72                                .entry(key.clone())
73                                .or_default()
74                                .insert(value.to_string());
75                        }
76                    }
77                }
78            }
79        }
80        drop(payload_storage);
81
82        collector.set_total_size(payload_size_bytes);
83
84        for (field, values) in distinct_values {
85            let mut col = crate::collection::stats::ColumnStats::new(field.clone())
86                .with_distinct_count(u64::try_from(values.len()).unwrap_or(u64::MAX));
87            if let Some(null_count) = null_counts.get(&field) {
88                col = col.with_null_count(*null_count);
89            }
90            collector.add_column_stats(col);
91        }
92
93        // HNSW index statistics
94        let hnsw_len = self.index.len();
95        let hnsw_stats = IndexStats::new("hnsw_primary", "HNSW")
96            .with_entry_count(u64::try_from(hnsw_len).unwrap_or(u64::MAX));
97        collector.add_index_stats(hnsw_stats);
98
99        // BM25 index statistics - use len() if available
100        let bm25_len = self.text_index.len();
101        if bm25_len > 0 {
102            let bm25_stats = IndexStats::new("bm25_text", "BM25")
103                .with_entry_count(u64::try_from(bm25_len).unwrap_or(u64::MAX));
104            collector.add_index_stats(bm25_stats);
105        }
106
107        Ok(collector.build())
108    }
109
110    /// Returns cached statistics if available, or computes them.
111    ///
112    /// This is a convenience method that avoids recomputing statistics
113    /// if they were recently computed. For fresh statistics, use `analyze()`.
114    ///
115    /// # Note
116    /// Returns default stats on error (intentional for convenience).
117    /// Use `analyze()` directly if error handling is required.
118    #[must_use]
119    pub fn get_stats(&self) -> CollectionStats {
120        // For now, always compute fresh stats
121        // Future: implement caching with TTL
122        // Design: returns default on error for convenience (caller can use analyze() for errors)
123        match self.analyze() {
124            Ok(stats) => stats,
125            Err(e) => {
126                tracing::warn!(
127                    "Failed to compute collection statistics: {}. Returning defaults.",
128                    e
129                );
130                CollectionStats::default()
131            }
132        }
133    }
134
135    /// Returns the selectivity estimate for a column.
136    ///
137    /// Selectivity is 1/cardinality, representing the probability
138    /// that a random row matches a specific value.
139    #[must_use]
140    pub fn estimate_column_selectivity(&self, column: &str) -> f64 {
141        let stats = self.get_stats();
142        stats.estimate_selectivity(column)
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use tempfile::TempDir;

    /// A freshly created collection reports zero rows, zero deletions, and still
    /// lists the primary HNSW index.
    #[test]
    fn test_analyze_empty_collection() {
        let dir = TempDir::new().unwrap();
        let storage_path = dir.path().to_path_buf();
        let empty = Collection::create(storage_path, 128, DistanceMetric::Cosine).unwrap();

        let snapshot = empty.analyze().unwrap();

        assert_eq!(snapshot.row_count, 0);
        assert_eq!(snapshot.deleted_count, 0);
        assert!(snapshot.index_stats.contains_key("hnsw_primary"));
    }

    /// After upserting ten points, the row count and the HNSW entry count reflect them.
    #[test]
    fn test_analyze_with_data() {
        use crate::point::Point;

        let dir = TempDir::new().unwrap();
        let storage_path = dir.path().to_path_buf();
        let populated = Collection::create(storage_path, 4, DistanceMetric::Cosine).unwrap();

        // Ten 4-dimensional points whose payloads cycle through three categories.
        let mut points: Vec<Point> = Vec::with_capacity(10);
        for i in 0..10 {
            let vector = vec![i as f32; 4];
            let payload = serde_json::json!({"category": format!("cat_{}", i % 3)});
            points.push(Point::new(i, vector, Some(payload)));
        }
        populated.upsert(points).unwrap();

        let snapshot = populated.analyze().unwrap();

        assert_eq!(snapshot.row_count, 10);
        let hnsw = snapshot.index_stats.get("hnsw_primary").unwrap();
        assert!(hnsw.entry_count >= 10);
    }

    /// `get_stats()` must never panic; on an empty collection it reports no live rows.
    #[test]
    fn test_get_stats_returns_defaults_on_error() {
        let dir = TempDir::new().unwrap();
        let storage_path = dir.path().to_path_buf();
        let empty = Collection::create(storage_path, 128, DistanceMetric::Cosine).unwrap();

        let snapshot = empty.get_stats();

        // Should not panic, returns default on any issue
        assert_eq!(snapshot.live_row_count(), 0);
    }
}