1use std::collections::HashMap;
4
5use instant::SystemTime;
6use rand::SeedableRng;
7
8use super::{histogram::BucketStrategy, ColumnStatistics, SampleMetadata, SamplingConfig};
9
10#[derive(Debug, Clone)]
12pub struct TableStatistics {
13 pub row_count: usize,
15
16 pub columns: HashMap<String, ColumnStatistics>,
18
19 pub last_updated: SystemTime,
21
22 pub is_stale: bool,
24
25 pub sample_metadata: Option<SampleMetadata>,
28
29 pub avg_row_bytes: Option<f64>,
39}
40
41impl TableStatistics {
42 pub fn estimate_from_schema(row_count: usize, schema: &vibesql_catalog::TableSchema) -> Self {
74 use vibesql_types::DataType;
75
76 let mut columns = std::collections::HashMap::new();
77
78 for col in &schema.columns {
79 let n_distinct = match &col.data_type {
80 DataType::Boolean => 2,
81 DataType::Integer | DataType::Smallint | DataType::Bigint | DataType::Unsigned => {
82 ((row_count as f64).sqrt() as usize).max(1)
84 }
85 DataType::Float { .. } | DataType::Real | DataType::DoublePrecision => {
86 let sqrt_count = (row_count as f64).sqrt() as usize;
89 sqrt_count.max(100).min(row_count)
90 }
91 DataType::Numeric { .. } | DataType::Decimal { .. } => {
92 ((row_count as f64).sqrt() as usize).max(1)
94 }
95 DataType::Varchar { .. }
96 | DataType::Character { .. }
97 | DataType::Name
98 | DataType::CharacterLargeObject => {
99 ((row_count as f64 * 0.5) as usize).max(1)
101 }
102 DataType::Date | DataType::Timestamp { .. } | DataType::Time { .. } => {
103 ((row_count as f64 * 0.8) as usize).max(1)
105 }
106 _ => {
107 ((row_count as f64).sqrt() as usize).max(1)
109 }
110 };
111
112 let null_count = if col.nullable {
114 ((row_count as f64 * 0.01) as usize).max(0)
116 } else {
117 0
118 };
119
120 let col_stats = ColumnStatistics {
121 n_distinct: n_distinct.max(1), null_count,
123 min_value: None, max_value: None,
125 most_common_values: Vec::new(), histogram: None, };
128
129 columns.insert(col.name.clone(), col_stats);
130 }
131
132 TableStatistics {
133 row_count,
134 columns,
135 last_updated: SystemTime::now(),
136 is_stale: true, sample_metadata: None,
138 avg_row_bytes: None, }
140 }
141
142 pub fn compute(rows: &[crate::Row], schema: &vibesql_catalog::TableSchema) -> Self {
144 Self::compute_with_config(rows, schema, None, false, 100, BucketStrategy::EqualDepth)
145 }
146
147 pub fn compute_with_config(
157 rows: &[crate::Row],
158 schema: &vibesql_catalog::TableSchema,
159 sampling_config: Option<SamplingConfig>,
160 enable_histograms: bool,
161 histogram_buckets: usize,
162 bucket_strategy: BucketStrategy,
163 ) -> Self {
164 use super::sampling::sample_rows;
165 let total_rows = rows.len();
166 let config = sampling_config.unwrap_or_else(SamplingConfig::adaptive);
167
168 let (sample_size, should_sample) = config.determine_sample_size(total_rows);
170
171 let mut rng = rand::rngs::StdRng::from_os_rng();
173 let sampled_rows =
174 if should_sample { sample_rows(rows, &config, &mut rng) } else { rows.to_vec() };
175
176 let sample_metadata = if should_sample {
178 Some(SampleMetadata::new(total_rows, sample_size, true, config.confidence_level))
179 } else {
180 None
181 };
182
183 let mut columns = HashMap::new();
185 for (idx, column) in schema.columns.iter().enumerate() {
186 let col_stats = ColumnStatistics::compute_with_histogram(
187 &sampled_rows,
188 idx,
189 enable_histograms,
190 histogram_buckets,
191 bucket_strategy.clone(),
192 );
193 columns.insert(column.name.clone(), col_stats);
194 }
195
196 let avg_row_bytes = if sampled_rows.is_empty() {
198 None
199 } else {
200 let total_bytes: usize =
201 sampled_rows.iter().map(|row| row.estimated_size_bytes()).sum();
202 Some(total_bytes as f64 / sampled_rows.len() as f64)
203 };
204
205 TableStatistics {
206 row_count: total_rows,
207 columns,
208 last_updated: SystemTime::now(),
209 is_stale: false,
210 sample_metadata,
211 avg_row_bytes,
212 }
213 }
214
215 pub fn compute_sampled(rows: &[crate::Row], schema: &vibesql_catalog::TableSchema) -> Self {
222 Self::compute_with_config(
223 rows,
224 schema,
225 Some(SamplingConfig::adaptive()),
226 false,
227 100,
228 BucketStrategy::EqualDepth,
229 )
230 }
231
232 pub fn compute_full_featured(
234 rows: &[crate::Row],
235 schema: &vibesql_catalog::TableSchema,
236 ) -> Self {
237 Self::compute_with_config(
238 rows,
239 schema,
240 Some(SamplingConfig::adaptive()),
241 true, 100, BucketStrategy::EqualDepth,
244 )
245 }
246
247 pub fn estimate_from_row_count(row_count: usize) -> Self {
269 TableStatistics {
270 row_count,
271 columns: HashMap::new(), last_updated: SystemTime::now(),
273 is_stale: true, sample_metadata: None,
275 avg_row_bytes: None, }
277 }
278
279 pub fn mark_stale(&mut self) {
281 self.is_stale = true;
282 }
283
284 pub fn needs_refresh(&self) -> bool {
288 self.is_stale
289 }
290}
291
#[cfg(test)]
mod tests {
    use vibesql_catalog::{ColumnSchema, TableSchema};
    use vibesql_types::{DataType, SqlValue};

    use super::*;
    use crate::Row;

    /// Shorthand for `ColumnSchema::new` that takes the name as `&str`.
    fn column(name: &str, data_type: DataType, nullable: bool) -> ColumnSchema {
        ColumnSchema::new(name.to_string(), data_type, nullable)
    }

    /// `test_table(id INTEGER NOT NULL, name VARCHAR(100))`; the nullability of
    /// `name` is chosen by the caller.
    fn id_name_schema(name_nullable: bool) -> TableSchema {
        TableSchema::new(
            "test_table".to_string(),
            vec![
                column("id", DataType::Integer, false),
                column("name", DataType::Varchar { max_length: Some(100) }, name_nullable),
            ],
        )
    }

    /// A table with a single non-nullable integer `id` column.
    fn id_only_schema(table_name: &str) -> TableSchema {
        TableSchema::new(table_name.to_string(), vec![column("id", DataType::Integer, false)])
    }

    /// One `(id, text)` row per entry of `texts`, with ids counting up from 1.
    fn numbered_text_rows(texts: &[&str]) -> Vec<Row> {
        (1..)
            .zip(texts)
            .map(|(id, text)| {
                Row::new(vec![
                    SqlValue::Integer(id),
                    SqlValue::Varchar(arcstr::ArcStr::from(*text)),
                ])
            })
            .collect()
    }

    #[test]
    fn test_table_statistics() {
        let schema = id_name_schema(true);
        let rows = numbered_text_rows(&["Alice", "Bob", "Alice"]);

        let stats = TableStatistics::compute(&rows, &schema);

        assert_eq!(stats.row_count, 3);
        assert_eq!(stats.columns.len(), 2);
        assert!(!stats.is_stale);

        // Three unique ids, but "Alice" appears twice among the names.
        assert_eq!(stats.columns["id"].n_distinct, 3);
        assert_eq!(stats.columns["name"].n_distinct, 2);
    }

    #[test]
    fn test_mark_stale() {
        let schema = TableSchema::new("test".to_string(), vec![]);
        let mut stats = TableStatistics::compute(&[], &schema);

        // Freshly computed statistics start out current.
        assert!(!stats.is_stale);
        assert!(!stats.needs_refresh());

        stats.mark_stale();

        assert!(stats.is_stale);
        assert!(stats.needs_refresh());
    }

    #[test]
    fn test_estimate_from_schema_basic() {
        let schema = TableSchema::new(
            "test_table".to_string(),
            vec![
                column("id", DataType::Integer, false),
                column("name", DataType::Varchar { max_length: Some(100) }, true),
                column("active", DataType::Boolean, false),
            ],
        );

        let stats = TableStatistics::estimate_from_schema(1000, &schema);

        assert_eq!(stats.row_count, 1000);
        assert!(stats.is_stale);
        assert_eq!(stats.columns.len(), 3);

        // Integer column: floor(sqrt(1000)) distinct values, declared NOT NULL.
        let id = &stats.columns["id"];
        assert_eq!(id.n_distinct, 31);
        assert_eq!(id.null_count, 0);

        // Nullable varchar column: half the rows distinct, some NULLs assumed.
        let name = &stats.columns["name"];
        assert_eq!(name.n_distinct, 500);
        assert!(name.null_count > 0);

        // Boolean column: two distinct values, declared NOT NULL.
        let active = &stats.columns["active"];
        assert_eq!(active.n_distinct, 2);
        assert_eq!(active.null_count, 0);
    }

    #[test]
    fn test_estimate_from_schema_various_types() {
        let schema = TableSchema::new(
            "test_table".to_string(),
            vec![
                column("bool_col", DataType::Boolean, false),
                column("int_col", DataType::Integer, false),
                column("float_col", DataType::Float { precision: 24 }, false),
                column("date_col", DataType::Date, false),
                column("nullable_col", DataType::Varchar { max_length: Some(50) }, true),
            ],
        );

        let stats = TableStatistics::estimate_from_schema(10000, &schema);
        let by_name = &stats.columns;

        assert_eq!(by_name["bool_col"].n_distinct, 2);
        assert_eq!(by_name["int_col"].n_distinct, 100);

        let float_ndv = by_name["float_col"].n_distinct;
        assert!(float_ndv >= 100);

        let date_ndv = by_name["date_col"].n_distinct;
        assert!(date_ndv > 5000);

        assert!(by_name["nullable_col"].null_count > 0);
    }

    #[test]
    fn test_estimate_from_schema_empty_table() {
        let schema = id_only_schema("empty_table");

        let stats = TableStatistics::estimate_from_schema(0, &schema);

        assert_eq!(stats.row_count, 0);
        assert!(stats.is_stale);
        // The distinct-value estimate is floored at one even with zero rows.
        assert!(stats.columns["id"].n_distinct >= 1);
    }

    #[test]
    fn test_estimate_from_row_count() {
        let stats = TableStatistics::estimate_from_row_count(1000);

        assert_eq!(stats.row_count, 1000);
        assert!(stats.columns.is_empty());
        assert!(stats.is_stale);
        assert!(stats.sample_metadata.is_none());
        assert!(stats.needs_refresh());
    }

    #[test]
    fn test_estimate_from_row_count_zero_rows() {
        let stats = TableStatistics::estimate_from_row_count(0);

        assert_eq!(stats.row_count, 0);
        assert!(stats.is_stale);
    }

    #[test]
    fn test_avg_row_bytes_computed_from_actual_data() {
        let schema = id_name_schema(false);
        let rows = numbered_text_rows(&["Alice", "Bob", "Charlie"]);

        let stats = TableStatistics::compute(&rows, &schema);

        assert!(stats.avg_row_bytes.is_some());
        let avg_bytes = stats.avg_row_bytes.unwrap();
        assert!(avg_bytes > 0.0, "avg_row_bytes should be positive: {}", avg_bytes);
    }

    #[test]
    fn test_avg_row_bytes_none_for_schema_estimates() {
        let schema = id_name_schema(false);

        let from_schema = TableStatistics::estimate_from_schema(1000, &schema);
        assert!(
            from_schema.avg_row_bytes.is_none(),
            "estimate_from_schema should not have avg_row_bytes"
        );

        let from_row_count = TableStatistics::estimate_from_row_count(1000);
        assert!(
            from_row_count.avg_row_bytes.is_none(),
            "estimate_from_row_count should not have avg_row_bytes"
        );
    }

    #[test]
    fn test_avg_row_bytes_none_for_empty_table() {
        let schema = id_only_schema("empty_table");

        let stats = TableStatistics::compute(&[], &schema);

        assert!(stats.avg_row_bytes.is_none(), "Empty table should have avg_row_bytes = None");
    }

    #[test]
    fn test_avg_row_bytes_varies_with_string_length() {
        let schema = TableSchema::new(
            "test_table".to_string(),
            vec![
                column("id", DataType::Integer, false),
                column("data", DataType::Varchar { max_length: Some(1000) }, false),
            ],
        );

        // Two rows holding one-character strings.
        let short_rows = numbered_text_rows(&["a", "b"]);
        let short_stats = TableStatistics::compute(&short_rows, &schema);

        // Two rows holding 500-character strings.
        let long_string = "x".repeat(500);
        let long_rows = numbered_text_rows(&[long_string.as_str(), long_string.as_str()]);
        let long_stats = TableStatistics::compute(&long_rows, &schema);

        let short_avg = short_stats.avg_row_bytes.unwrap();
        let long_avg = long_stats.avg_row_bytes.unwrap();
        assert!(
            long_avg > short_avg,
            "Long strings ({}) should have larger avg_row_bytes than short strings ({})",
            long_avg,
            short_avg
        );
    }
}