1use std::collections::HashMap;
6use std::sync::RwLock;
7
8pub struct Statistics {
10 tables: RwLock<HashMap<String, TableStats>>,
12 columns: RwLock<HashMap<String, ColumnStats>>,
14}
15
16impl Statistics {
17 pub fn new() -> Self {
19 Self {
20 tables: RwLock::new(HashMap::new()),
21 columns: RwLock::new(HashMap::new()),
22 }
23 }
24
25 pub fn update_table_stats(&self, table_name: String, stats: TableStats) {
27 self.tables.write().unwrap().insert(table_name, stats);
28 }
29
30 pub fn get_table_stats(&self, table_name: &str) -> Option<TableStats> {
32 self.tables.read().unwrap().get(table_name).cloned()
33 }
34
35 pub fn update_column_stats(&self, column_key: String, stats: ColumnStats) {
37 self.columns.write().unwrap().insert(column_key, stats);
38 }
39
40 pub fn get_column_stats(&self, column_key: &str) -> Option<ColumnStats> {
42 self.columns.read().unwrap().get(column_key).cloned()
43 }
44
45 pub fn is_empty(&self) -> bool {
47 self.tables.read().unwrap().is_empty() && self.columns.read().unwrap().is_empty()
48 }
49
50 pub fn clear(&self) {
52 self.tables.write().unwrap().clear();
53 self.columns.write().unwrap().clear();
54 }
55
56 pub fn estimate_join_selectivity(
58 &self,
59 left_table: &str,
60 right_table: &str,
61 join_column: &str,
62 ) -> f64 {
63 let left_stats = self.get_table_stats(left_table);
64 let right_stats = self.get_table_stats(right_table);
65
66 if let (Some(left), Some(right)) = (left_stats, right_stats) {
67 let left_ndv = left.row_count as f64;
69 let right_ndv = right.row_count as f64;
70
71 if left_ndv > 0.0 && right_ndv > 0.0 {
72 1.0 / left_ndv.max(right_ndv)
73 } else {
74 0.1 }
76 } else {
77 0.1 }
79 }
80
81 pub fn estimate_filter_selectivity(&self, column_key: &str, operator: &str) -> f64 {
83 if let Some(stats) = self.get_column_stats(column_key) {
84 match operator {
85 "=" => 1.0 / stats.ndv.max(1) as f64,
86 ">" | "<" => 0.33,
87 ">=" | "<=" => 0.33,
88 "!=" => 1.0 - (1.0 / stats.ndv.max(1) as f64),
89 "LIKE" => 0.1,
90 "IN" => 0.2,
91 _ => 0.1,
92 }
93 } else {
94 0.1 }
96 }
97}
98
99impl Default for Statistics {
100 fn default() -> Self {
101 Self::new()
102 }
103}
104
105#[derive(Debug, Clone)]
107pub struct TableStats {
108 pub row_count: usize,
110 pub avg_row_size: usize,
112 pub total_size: usize,
114 pub ndv: usize,
116 pub last_updated: std::time::SystemTime,
118}
119
120impl TableStats {
121 pub fn new(row_count: usize, avg_row_size: usize) -> Self {
123 Self {
124 row_count,
125 avg_row_size,
126 total_size: row_count * avg_row_size,
127 ndv: row_count, last_updated: std::time::SystemTime::now(),
129 }
130 }
131
132 pub fn update_row_count(&mut self, row_count: usize) {
134 self.row_count = row_count;
135 self.total_size = row_count * self.avg_row_size;
136 self.last_updated = std::time::SystemTime::now();
137 }
138
139 pub fn estimate_scan_cost(&self) -> f64 {
141 self.row_count as f64 * 0.001 }
143}
144
145#[derive(Debug, Clone)]
147pub struct ColumnStats {
148 pub ndv: usize,
150 pub null_count: usize,
152 pub min_value: Option<ColumnValue>,
154 pub max_value: Option<ColumnValue>,
156 pub histogram: Option<Histogram>,
158 pub mcv: Vec<(ColumnValue, usize)>,
160}
161
162impl ColumnStats {
163 pub fn new(ndv: usize, null_count: usize) -> Self {
165 Self {
166 ndv,
167 null_count,
168 min_value: None,
169 max_value: None,
170 histogram: None,
171 mcv: Vec::new(),
172 }
173 }
174
175 pub fn with_range(mut self, min: ColumnValue, max: ColumnValue) -> Self {
177 self.min_value = Some(min);
178 self.max_value = Some(max);
179 self
180 }
181
182 pub fn with_histogram(mut self, histogram: Histogram) -> Self {
184 self.histogram = Some(histogram);
185 self
186 }
187
188 pub fn with_mcv(mut self, mcv: Vec<(ColumnValue, usize)>) -> Self {
190 self.mcv = mcv;
191 self
192 }
193
194 pub fn estimate_equality_selectivity(&self, value: &ColumnValue) -> f64 {
196 for (mcv_val, freq) in &self.mcv {
198 if mcv_val == value {
199 return *freq as f64 / self.ndv as f64;
200 }
201 }
202
203 if self.ndv > 0 {
205 1.0 / self.ndv as f64
206 } else {
207 0.0
208 }
209 }
210
211 pub fn estimate_range_selectivity(&self, start: &ColumnValue, end: &ColumnValue) -> f64 {
213 if let Some(histogram) = &self.histogram {
214 histogram.estimate_range_selectivity(start, end)
215 } else {
216 0.33 }
218 }
219}
220
221#[derive(Debug, Clone, PartialEq, PartialOrd)]
223pub enum ColumnValue {
224 Int64(i64),
225 Float64(f64),
226 String(String),
227 Boolean(bool),
228}
229
230#[derive(Debug, Clone)]
232pub struct Histogram {
233 pub buckets: Vec<HistogramBucket>,
235 pub total_count: usize,
237}
238
239impl Histogram {
240 pub fn new(buckets: Vec<HistogramBucket>, total_count: usize) -> Self {
242 Self {
243 buckets,
244 total_count,
245 }
246 }
247
248 pub fn equi_width(min: f64, max: f64, num_buckets: usize, values: &[f64]) -> Self {
250 let width = (max - min) / num_buckets as f64;
251 let mut buckets = Vec::with_capacity(num_buckets);
252
253 for i in 0..num_buckets {
254 let lower = min + i as f64 * width;
255 let upper = if i == num_buckets - 1 {
256 max
257 } else {
258 min + (i + 1) as f64 * width
259 };
260
261 let count = values.iter().filter(|&&v| v >= lower && v < upper).count();
262 let ndv = values
264 .iter()
265 .filter(|&&v| v >= lower && v < upper)
266 .map(|&v| ordered_float::OrderedFloat(v))
267 .collect::<std::collections::BTreeSet<_>>()
268 .len();
269
270 buckets.push(HistogramBucket {
271 lower_bound: ColumnValue::Float64(lower),
272 upper_bound: ColumnValue::Float64(upper),
273 count,
274 ndv,
275 });
276 }
277
278 Self {
279 buckets,
280 total_count: values.len(),
281 }
282 }
283
284 pub fn estimate_range_selectivity(&self, start: &ColumnValue, end: &ColumnValue) -> f64 {
286 if self.total_count == 0 {
287 return 0.0;
288 }
289
290 let mut matching_count = 0;
291 for bucket in &self.buckets {
292 if bucket.overlaps(start, end) {
293 matching_count += bucket.count;
294 }
295 }
296
297 matching_count as f64 / self.total_count as f64
298 }
299
300 pub fn num_buckets(&self) -> usize {
302 self.buckets.len()
303 }
304}
305
306#[derive(Debug, Clone)]
308pub struct HistogramBucket {
309 pub lower_bound: ColumnValue,
311 pub upper_bound: ColumnValue,
313 pub count: usize,
315 pub ndv: usize,
317}
318
319impl HistogramBucket {
320 pub fn overlaps(&self, start: &ColumnValue, end: &ColumnValue) -> bool {
322 self.lower_bound <= *end && self.upper_bound >= *start
324 }
325
326 pub fn width(&self) -> Option<f64> {
328 match (&self.lower_bound, &self.upper_bound) {
329 (ColumnValue::Float64(lower), ColumnValue::Float64(upper)) => Some(upper - lower),
330 (ColumnValue::Int64(lower), ColumnValue::Int64(upper)) => Some((upper - lower) as f64),
331 _ => None,
332 }
333 }
334}
335
336#[cfg(test)]
337mod tests {
338 use super::*;
339
340 #[test]
341 fn test_statistics_creation() {
342 let stats = Statistics::new();
343 assert!(stats.is_empty());
344 }
345
346 #[test]
347 fn test_table_stats() {
348 let stats = Statistics::new();
349 let table_stats = TableStats::new(1000, 128);
350
351 stats.update_table_stats("nodes".to_string(), table_stats.clone());
352
353 let retrieved = stats.get_table_stats("nodes");
354 assert!(retrieved.is_some());
355 assert_eq!(retrieved.unwrap().row_count, 1000);
356 }
357
358 #[test]
359 fn test_column_stats() {
360 let stats = Statistics::new();
361 let col_stats = ColumnStats::new(500, 10);
362
363 stats.update_column_stats("nodes.id".to_string(), col_stats);
364
365 let retrieved = stats.get_column_stats("nodes.id");
366 assert!(retrieved.is_some());
367 assert_eq!(retrieved.unwrap().ndv, 500);
368 }
369
370 #[test]
371 fn test_histogram_creation() {
372 let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
373 let histogram = Histogram::equi_width(1.0, 10.0, 5, &values);
374
375 assert_eq!(histogram.num_buckets(), 5);
376 assert_eq!(histogram.total_count, 10);
377 }
378
379 #[test]
380 fn test_selectivity_estimation() {
381 let stats = Statistics::new();
382 let table_stats = TableStats::new(1000, 128);
383
384 stats.update_table_stats("nodes".to_string(), table_stats);
385
386 let selectivity = stats.estimate_join_selectivity("nodes", "edges", "id");
387 assert!(selectivity > 0.0 && selectivity <= 1.0);
388 }
389
390 #[test]
391 fn test_filter_selectivity() {
392 let stats = Statistics::new();
393 let col_stats = ColumnStats::new(100, 5);
394
395 stats.update_column_stats("nodes.age".to_string(), col_stats);
396
397 let selectivity = stats.estimate_filter_selectivity("nodes.age", "=");
398 assert_eq!(selectivity, 0.01); }
400}