vibesql_storage/statistics/
table.rs

1//! Table-level statistics
2
3use std::collections::HashMap;
4
5use instant::SystemTime;
6use rand::SeedableRng;
7
8use super::{histogram::BucketStrategy, ColumnStatistics, SampleMetadata, SamplingConfig};
9
10/// Statistics for an entire table
11#[derive(Debug, Clone)]
12pub struct TableStatistics {
13    /// Total number of rows
14    pub row_count: usize,
15
16    /// Per-column statistics
17    pub columns: HashMap<String, ColumnStatistics>,
18
19    /// Timestamp when stats were last updated
20    pub last_updated: SystemTime,
21
22    /// Whether stats are stale (need recomputation)
23    pub is_stale: bool,
24
25    /// Sampling metadata (Phase 5.2)
26    /// None if no sampling was used (small table)
27    pub sample_metadata: Option<SampleMetadata>,
28
29    /// Average row size in bytes (computed from sampled data)
30    ///
31    /// This provides actual row size measurements that account for:
32    /// - Real string/varchar fill ratios (not heuristic estimates)
33    /// - Actual NULL prevalence
34    /// - True BLOB/CLOB sizes
35    ///
36    /// Used by DML cost estimation to scale WAL write costs.
37    /// None if statistics were estimated from schema (no actual data sampled).
38    pub avg_row_bytes: Option<f64>,
39}
40
41impl TableStatistics {
42    /// Create estimated statistics with basic column estimates
43    ///
44    /// This method provides reasonable defaults for column statistics without
45    /// requiring a full ANALYZE scan. It uses data type information to generate
46    /// basic statistics using conservative heuristics.
47    ///
48    /// # Heuristics Used
49    /// - **Boolean columns**: n_distinct = 2
50    /// - **Integer/Smallint/Bigint/Unsigned columns**: n_distinct = sqrt(row_count) (conservative)
51    /// - **Float/Real/DoublePrecision columns**: n_distinct = sqrt(row_count) to 100 (high
52    ///   cardinality)
53    /// - **Varchar/Character/Name columns**: n_distinct = row_count * 0.5 (assume moderate
54    ///   uniqueness)
55    /// - **Date/Timestamp/Time columns**: n_distinct = row_count * 0.8 (high cardinality)
56    /// - **Numeric/Decimal columns**: n_distinct = sqrt(row_count) (moderate)
57    /// - **Nullable columns**: null_count ≈ row_count * 0.01 (1% estimated nulls)
58    /// - **Non-nullable columns**: null_count = 0
59    /// - **All columns**: is_stale = true (clearly marked as estimates)
60    ///
61    /// # Arguments
62    /// * `row_count` - Total number of rows in the table
63    /// * `schema` - Table schema with column definitions
64    ///
65    /// # Example
66    /// ```text
67    /// let stats = TableStatistics::estimate_from_schema(5000, &schema);
68    /// // Boolean col: n_distinct = 2
69    /// // Integer col: n_distinct = sqrt(5000) ≈ 70
70    /// // Varchar col: n_distinct = 2500
71    /// // All columns: is_stale = true
72    /// ```
73    pub fn estimate_from_schema(row_count: usize, schema: &vibesql_catalog::TableSchema) -> Self {
74        use vibesql_types::DataType;
75
76        let mut columns = std::collections::HashMap::new();
77
78        for col in &schema.columns {
79            let n_distinct = match &col.data_type {
80                DataType::Boolean => 2,
81                DataType::Integer | DataType::Smallint | DataType::Bigint | DataType::Unsigned => {
82                    // Conservative: sqrt(row_count) - typically much less than actual cardinality
83                    ((row_count as f64).sqrt() as usize).max(1)
84                }
85                DataType::Float { .. } | DataType::Real | DataType::DoublePrecision => {
86                    // Floating point often has high cardinality
87                    // Use sqrt but ensure at least 100 and at most row_count
88                    let sqrt_count = (row_count as f64).sqrt() as usize;
89                    sqrt_count.max(100).min(row_count)
90                }
91                DataType::Numeric { .. } | DataType::Decimal { .. } => {
92                    // Numeric with precision/scale: moderate cardinality
93                    ((row_count as f64).sqrt() as usize).max(1)
94                }
95                DataType::Varchar { .. }
96                | DataType::Character { .. }
97                | DataType::Name
98                | DataType::CharacterLargeObject => {
99                    // String columns: assume moderate uniqueness (50%)
100                    ((row_count as f64 * 0.5) as usize).max(1)
101                }
102                DataType::Date | DataType::Timestamp { .. } | DataType::Time { .. } => {
103                    // Temporal types: high cardinality
104                    ((row_count as f64 * 0.8) as usize).max(1)
105                }
106                _ => {
107                    // Other types: conservative estimate
108                    ((row_count as f64).sqrt() as usize).max(1)
109                }
110            };
111
112            // Estimate null fraction based on nullability
113            let null_count = if col.nullable {
114                // Estimate 1% nulls for nullable columns
115                ((row_count as f64 * 0.01) as usize).max(0)
116            } else {
117                0
118            };
119
120            let col_stats = ColumnStatistics {
121                n_distinct: n_distinct.max(1), // At least 1 distinct value
122                null_count,
123                min_value: None, // No range info without scanning
124                max_value: None,
125                most_common_values: Vec::new(), // No MCVs without scanning
126                histogram: None,                // No histogram without scanning
127            };
128
129            columns.insert(col.name.clone(), col_stats);
130        }
131
132        TableStatistics {
133            row_count,
134            columns,
135            last_updated: SystemTime::now(),
136            is_stale: true, // Clearly marked as estimates
137            sample_metadata: None,
138            avg_row_bytes: None, // No actual data sampled
139        }
140    }
141
142    /// Compute statistics by scanning the table
143    pub fn compute(rows: &[crate::Row], schema: &vibesql_catalog::TableSchema) -> Self {
144        Self::compute_with_config(rows, schema, None, false, 100, BucketStrategy::EqualDepth)
145    }
146
147    /// Compute statistics with sampling (Phase 5.2) and histogram support (Phase 5.1)
148    ///
149    /// # Arguments
150    /// * `rows` - All table rows
151    /// * `schema` - Table schema
152    /// * `sampling_config` - Optional sampling configuration (None = adaptive)
153    /// * `enable_histograms` - Whether to build histograms
154    /// * `histogram_buckets` - Number of histogram buckets
155    /// * `bucket_strategy` - Histogram bucketing strategy
156    pub fn compute_with_config(
157        rows: &[crate::Row],
158        schema: &vibesql_catalog::TableSchema,
159        sampling_config: Option<SamplingConfig>,
160        enable_histograms: bool,
161        histogram_buckets: usize,
162        bucket_strategy: BucketStrategy,
163    ) -> Self {
164        use super::sampling::sample_rows;
165        let total_rows = rows.len();
166        let config = sampling_config.unwrap_or_else(SamplingConfig::adaptive);
167
168        // Determine if sampling is needed
169        let (sample_size, should_sample) = config.determine_sample_size(total_rows);
170
171        // Sample rows if needed (Phase 5.2)
172        let mut rng = rand::rngs::StdRng::from_os_rng();
173        let sampled_rows =
174            if should_sample { sample_rows(rows, &config, &mut rng) } else { rows.to_vec() };
175
176        // Create sample metadata
177        let sample_metadata = if should_sample {
178            Some(SampleMetadata::new(total_rows, sample_size, true, config.confidence_level))
179        } else {
180            None
181        };
182
183        // Compute column statistics on the sample
184        let mut columns = HashMap::new();
185        for (idx, column) in schema.columns.iter().enumerate() {
186            let col_stats = ColumnStatistics::compute_with_histogram(
187                &sampled_rows,
188                idx,
189                enable_histograms,
190                histogram_buckets,
191                bucket_strategy.clone(),
192            );
193            columns.insert(column.name.clone(), col_stats);
194        }
195
196        // Compute average row size from sampled data
197        let avg_row_bytes = if sampled_rows.is_empty() {
198            None
199        } else {
200            let total_bytes: usize =
201                sampled_rows.iter().map(|row| row.estimated_size_bytes()).sum();
202            Some(total_bytes as f64 / sampled_rows.len() as f64)
203        };
204
205        TableStatistics {
206            row_count: total_rows,
207            columns,
208            last_updated: SystemTime::now(),
209            is_stale: false,
210            sample_metadata,
211            avg_row_bytes,
212        }
213    }
214
215    /// Compute statistics using adaptive sampling (Phase 5.2 convenience method)
216    ///
217    /// This automatically:
218    /// - Uses full scan for small tables (< 1000 rows)
219    /// - Uses 10% sample for medium tables (1K-100K rows)
220    /// - Uses fixed 10K sample for large tables (> 100K rows)
221    pub fn compute_sampled(rows: &[crate::Row], schema: &vibesql_catalog::TableSchema) -> Self {
222        Self::compute_with_config(
223            rows,
224            schema,
225            Some(SamplingConfig::adaptive()),
226            false,
227            100,
228            BucketStrategy::EqualDepth,
229        )
230    }
231
232    /// Compute statistics with both sampling and histograms enabled
233    pub fn compute_full_featured(
234        rows: &[crate::Row],
235        schema: &vibesql_catalog::TableSchema,
236    ) -> Self {
237        Self::compute_with_config(
238            rows,
239            schema,
240            Some(SamplingConfig::adaptive()),
241            true, // Enable histograms
242            100,  // 100 buckets
243            BucketStrategy::EqualDepth,
244        )
245    }
246
247    /// Create estimated statistics from table metadata without full ANALYZE
248    ///
249    /// This provides a fallback for cost estimation when detailed statistics
250    /// aren't available (i.e., ANALYZE hasn't been run). It uses the table's
251    /// row count and provides conservative defaults for other fields.
252    ///
253    /// # Use Cases
254    /// - DML cost estimation when ANALYZE hasn't been run
255    /// - Quick cost comparisons before detailed statistics are available
256    ///
257    /// # Limitations
258    /// - No per-column statistics (empty columns map)
259    /// - No histogram data
260    /// - Marked as stale to indicate these are estimates
261    ///
262    /// # Example
263    /// ```text
264    /// let table_stats = table.get_statistics()
265    ///     .cloned()
266    ///     .unwrap_or_else(|| TableStatistics::estimate_from_row_count(table.row_count()));
267    /// ```
268    pub fn estimate_from_row_count(row_count: usize) -> Self {
269        TableStatistics {
270            row_count,
271            columns: HashMap::new(), // No per-column stats without ANALYZE
272            last_updated: SystemTime::now(),
273            is_stale: true, // Mark as stale since these are estimates
274            sample_metadata: None,
275            avg_row_bytes: None, // No actual data sampled
276        }
277    }
278
279    /// Mark statistics as stale after significant data changes
280    pub fn mark_stale(&mut self) {
281        self.is_stale = true;
282    }
283
284    /// Check if statistics should be recomputed
285    ///
286    /// Returns true if stats are marked stale or too old
287    pub fn needs_refresh(&self) -> bool {
288        self.is_stale
289    }
290}
291
292#[cfg(test)]
293mod tests {
294    use vibesql_catalog::{ColumnSchema, TableSchema};
295    use vibesql_types::{DataType, SqlValue};
296
297    use super::*;
298    use crate::Row;
299
300    #[test]
301    fn test_table_statistics() {
302        let schema = TableSchema::new(
303            "test_table".to_string(),
304            vec![
305                ColumnSchema::new("id".to_string(), DataType::Integer, false),
306                ColumnSchema::new(
307                    "name".to_string(),
308                    DataType::Varchar { max_length: Some(100) },
309                    true,
310                ),
311            ],
312        );
313
314        let rows = vec![
315            Row::new(vec![SqlValue::Integer(1), SqlValue::Varchar(arcstr::ArcStr::from("Alice"))]),
316            Row::new(vec![SqlValue::Integer(2), SqlValue::Varchar(arcstr::ArcStr::from("Bob"))]),
317            Row::new(vec![SqlValue::Integer(3), SqlValue::Varchar(arcstr::ArcStr::from("Alice"))]),
318        ];
319
320        let stats = TableStatistics::compute(&rows, &schema);
321
322        assert_eq!(stats.row_count, 3);
323        assert_eq!(stats.columns.len(), 2);
324        assert!(!stats.is_stale);
325
326        // Check column stats
327        let id_stats = stats.columns.get("id").unwrap();
328        assert_eq!(id_stats.n_distinct, 3);
329
330        let name_stats = stats.columns.get("name").unwrap();
331        assert_eq!(name_stats.n_distinct, 2); // Alice, Bob
332    }
333
334    #[test]
335    fn test_mark_stale() {
336        let schema = TableSchema::new("test".to_string(), vec![]);
337
338        let mut stats = TableStatistics::compute(&[], &schema);
339        assert!(!stats.is_stale);
340        assert!(!stats.needs_refresh());
341
342        stats.mark_stale();
343        assert!(stats.is_stale);
344        assert!(stats.needs_refresh());
345    }
346
347    #[test]
348    fn test_estimate_from_schema_basic() {
349        let schema = TableSchema::new(
350            "test_table".to_string(),
351            vec![
352                ColumnSchema::new("id".to_string(), DataType::Integer, false),
353                ColumnSchema::new(
354                    "name".to_string(),
355                    DataType::Varchar { max_length: Some(100) },
356                    true,
357                ),
358                ColumnSchema::new("active".to_string(), DataType::Boolean, false),
359            ],
360        );
361
362        let stats = TableStatistics::estimate_from_schema(1000, &schema);
363
364        assert_eq!(stats.row_count, 1000);
365        assert!(stats.is_stale); // Should be marked as estimates
366        assert_eq!(stats.columns.len(), 3);
367
368        // Check Integer column: n_distinct = sqrt(1000) ≈ 31
369        let id_stats = stats.columns.get("id").unwrap();
370        assert_eq!(id_stats.n_distinct, 31); // sqrt(1000) ≈ 31
371        assert_eq!(id_stats.null_count, 0); // Non-nullable
372
373        // Check Varchar column: n_distinct = 1000 * 0.5 = 500
374        let name_stats = stats.columns.get("name").unwrap();
375        assert_eq!(name_stats.n_distinct, 500); // Moderate uniqueness
376        assert!(name_stats.null_count > 0); // Nullable, so ~1% nulls = ~10
377
378        // Check Boolean column: n_distinct = 2
379        let active_stats = stats.columns.get("active").unwrap();
380        assert_eq!(active_stats.n_distinct, 2);
381        assert_eq!(active_stats.null_count, 0); // Non-nullable
382    }
383
384    #[test]
385    fn test_estimate_from_schema_various_types() {
386        let schema = TableSchema::new(
387            "test_table".to_string(),
388            vec![
389                ColumnSchema::new("bool_col".to_string(), DataType::Boolean, false),
390                ColumnSchema::new("int_col".to_string(), DataType::Integer, false),
391                ColumnSchema::new(
392                    "float_col".to_string(),
393                    DataType::Float { precision: 24 },
394                    false,
395                ),
396                ColumnSchema::new("date_col".to_string(), DataType::Date, false),
397                ColumnSchema::new(
398                    "nullable_col".to_string(),
399                    DataType::Varchar { max_length: Some(50) },
400                    true,
401                ),
402            ],
403        );
404
405        let stats = TableStatistics::estimate_from_schema(10000, &schema);
406
407        // Boolean: 2 distinct
408        assert_eq!(stats.columns.get("bool_col").unwrap().n_distinct, 2);
409
410        // Integer: sqrt(10000) = 100
411        assert_eq!(stats.columns.get("int_col").unwrap().n_distinct, 100);
412
413        // Float: high cardinality (at least 100)
414        let float_ndv = stats.columns.get("float_col").unwrap().n_distinct;
415        assert!(float_ndv >= 100);
416
417        // Date: high cardinality (80%)
418        let date_ndv = stats.columns.get("date_col").unwrap().n_distinct;
419        assert!(date_ndv > 5000);
420
421        // Nullable: should have some null_count
422        let nullable_stats = stats.columns.get("nullable_col").unwrap();
423        assert!(nullable_stats.null_count > 0);
424    }
425
426    #[test]
427    fn test_estimate_from_schema_empty_table() {
428        let schema = TableSchema::new(
429            "empty_table".to_string(),
430            vec![ColumnSchema::new("id".to_string(), DataType::Integer, false)],
431        );
432
433        let stats = TableStatistics::estimate_from_schema(0, &schema);
434        assert_eq!(stats.row_count, 0);
435        assert!(stats.is_stale);
436
437        // Even with 0 rows, should have at least 1 distinct value estimate
438        let id_stats = stats.columns.get("id").unwrap();
439        assert!(id_stats.n_distinct >= 1);
440    }
441
442    #[test]
443    fn test_estimate_from_row_count() {
444        // Test the fallback statistics method
445        let stats = TableStatistics::estimate_from_row_count(1000);
446
447        assert_eq!(stats.row_count, 1000);
448        assert!(stats.columns.is_empty()); // No per-column stats without ANALYZE
449        assert!(stats.is_stale); // Marked as stale since these are estimates
450        assert!(stats.sample_metadata.is_none());
451        assert!(stats.needs_refresh()); // Should indicate refresh is needed
452    }
453
454    #[test]
455    fn test_estimate_from_row_count_zero_rows() {
456        // Test with empty table
457        let stats = TableStatistics::estimate_from_row_count(0);
458
459        assert_eq!(stats.row_count, 0);
460        assert!(stats.is_stale);
461    }
462
463    // ============================================================================
464    // avg_row_bytes Tests (Issue #3980)
465    // ============================================================================
466
467    #[test]
468    fn test_avg_row_bytes_computed_from_actual_data() {
469        let schema = TableSchema::new(
470            "test_table".to_string(),
471            vec![
472                ColumnSchema::new("id".to_string(), DataType::Integer, false),
473                ColumnSchema::new(
474                    "name".to_string(),
475                    DataType::Varchar { max_length: Some(100) },
476                    false,
477                ),
478            ],
479        );
480
481        let rows = vec![
482            Row::new(vec![SqlValue::Integer(1), SqlValue::Varchar(arcstr::ArcStr::from("Alice"))]),
483            Row::new(vec![SqlValue::Integer(2), SqlValue::Varchar(arcstr::ArcStr::from("Bob"))]),
484            Row::new(vec![
485                SqlValue::Integer(3),
486                SqlValue::Varchar(arcstr::ArcStr::from("Charlie")),
487            ]),
488        ];
489
490        let stats = TableStatistics::compute(&rows, &schema);
491
492        // avg_row_bytes should be computed from actual data
493        assert!(stats.avg_row_bytes.is_some());
494        let avg_bytes = stats.avg_row_bytes.unwrap();
495        // Should be positive and reasonable (Row struct + values)
496        assert!(avg_bytes > 0.0, "avg_row_bytes should be positive: {}", avg_bytes);
497    }
498
499    #[test]
500    fn test_avg_row_bytes_none_for_schema_estimates() {
501        let schema = TableSchema::new(
502            "test_table".to_string(),
503            vec![
504                ColumnSchema::new("id".to_string(), DataType::Integer, false),
505                ColumnSchema::new(
506                    "name".to_string(),
507                    DataType::Varchar { max_length: Some(100) },
508                    false,
509                ),
510            ],
511        );
512
513        // estimate_from_schema should NOT have avg_row_bytes (no actual data)
514        let stats = TableStatistics::estimate_from_schema(1000, &schema);
515        assert!(
516            stats.avg_row_bytes.is_none(),
517            "estimate_from_schema should not have avg_row_bytes"
518        );
519
520        // estimate_from_row_count should NOT have avg_row_bytes
521        let stats = TableStatistics::estimate_from_row_count(1000);
522        assert!(
523            stats.avg_row_bytes.is_none(),
524            "estimate_from_row_count should not have avg_row_bytes"
525        );
526    }
527
528    #[test]
529    fn test_avg_row_bytes_none_for_empty_table() {
530        let schema = TableSchema::new(
531            "empty_table".to_string(),
532            vec![ColumnSchema::new("id".to_string(), DataType::Integer, false)],
533        );
534
535        // Empty table should have avg_row_bytes = None
536        let stats = TableStatistics::compute(&[], &schema);
537        assert!(stats.avg_row_bytes.is_none(), "Empty table should have avg_row_bytes = None");
538    }
539
540    #[test]
541    fn test_avg_row_bytes_varies_with_string_length() {
542        let schema = TableSchema::new(
543            "test_table".to_string(),
544            vec![
545                ColumnSchema::new("id".to_string(), DataType::Integer, false),
546                ColumnSchema::new(
547                    "data".to_string(),
548                    DataType::Varchar { max_length: Some(1000) },
549                    false,
550                ),
551            ],
552        );
553
554        // Short strings
555        let short_rows = vec![
556            Row::new(vec![SqlValue::Integer(1), SqlValue::Varchar(arcstr::ArcStr::from("a"))]),
557            Row::new(vec![SqlValue::Integer(2), SqlValue::Varchar(arcstr::ArcStr::from("b"))]),
558        ];
559        let short_stats = TableStatistics::compute(&short_rows, &schema);
560
561        // Long strings
562        let long_string = "x".repeat(500);
563        let long_rows = vec![
564            Row::new(vec![
565                SqlValue::Integer(1),
566                SqlValue::Varchar(arcstr::ArcStr::from(&long_string)),
567            ]),
568            Row::new(vec![
569                SqlValue::Integer(2),
570                SqlValue::Varchar(arcstr::ArcStr::from(&long_string)),
571            ]),
572        ];
573        let long_stats = TableStatistics::compute(&long_rows, &schema);
574
575        // Long strings should result in larger avg_row_bytes
576        let short_avg = short_stats.avg_row_bytes.unwrap();
577        let long_avg = long_stats.avg_row_bytes.unwrap();
578        assert!(
579            long_avg > short_avg,
580            "Long strings ({}) should have larger avg_row_bytes than short strings ({})",
581            long_avg,
582            short_avg
583        );
584    }
585}