vibesql_executor/
dml_cost.rs

//! DML Cost-Based Optimization
//!
//! This module provides cost-based optimization decisions for DML operations (INSERT, UPDATE, DELETE).
//! It uses the cost estimation infrastructure from vibesql-storage to make runtime decisions about:
//! - Batch sizes for INSERT operations
//! - Strategy selection for DELETE operations
//! - Index update optimization for UPDATE operations
//!
//! # Usage
//!
//! The `DmlOptimizer` provides methods to compute optimization hints based on estimated costs:
//!
//! ```rust,no_run
//! use vibesql_executor::DmlOptimizer;
//! use vibesql_storage::Database;
//!
//! let db = Database::new();
//! let optimizer = DmlOptimizer::new(&db, "table_name");
//! let total_rows = 1000;
//! let batch_size = optimizer.optimal_insert_batch_size(total_rows);
//! let should_chunk = optimizer.should_chunk_delete(total_rows);
//! // Note: compute_indexes_affected_ratio requires a TableSchema
//! ```

use vibesql_storage::{
    statistics::{CostEstimator, TableIndexInfo, TableStatistics},
    Database,
};

/// Cost thresholds for optimization decisions
/// These are tuned based on TPC-C and Sysbench profiling data
pub mod thresholds {
    /// Cost per row above which we use smaller batch sizes for INSERT
    /// High-index tables (>3 B-tree indexes) benefit from smaller batches
    pub const HIGH_COST_INSERT_THRESHOLD: f64 = 0.5;

    /// Maximum batch size for high-cost tables (many indexes)
    pub const SMALL_BATCH_SIZE: usize = 100;

    /// Default batch size for low-cost tables
    pub const LARGE_BATCH_SIZE: usize = 1000;

    /// Deleted ratio above which early compaction may be beneficial
    pub const HIGH_DELETED_RATIO_THRESHOLD: f64 = 0.4;

    /// Row count above which chunked deletion should be considered
    pub const CHUNK_DELETE_ROW_THRESHOLD: usize = 10000;

    /// Chunk size for large deletes to avoid long locks
    pub const DELETE_CHUNK_SIZE: usize = 1000;
}

/// DML cost-based optimizer
///
/// Provides optimization decisions for DML operations based on cost estimation.
pub struct DmlOptimizer<'a> {
    /// Cost estimator with default parameters
    cost_estimator: CostEstimator,
    /// Table index information for cost calculation
    index_info: Option<TableIndexInfo>,
    /// Table statistics (may be None for new/small tables)
    table_stats: Option<TableStatistics>,
    /// Reference to database for additional lookups
    #[allow(dead_code)]
    db: &'a Database,
    /// Table name being optimized
    table_name: &'a str,
}

impl<'a> DmlOptimizer<'a> {
    /// Create a new DML optimizer for a table
    ///
    /// # Arguments
    /// * `db` - Database reference for metadata lookups
    /// * `table_name` - Name of the table being operated on
    ///
    /// # Returns
    /// DmlOptimizer instance with cost estimation configured
    pub fn new(db: &'a Database, table_name: &'a str) -> Self {
        let index_info = db.get_table_index_info(table_name);
        let table_stats = db
            .get_table(table_name)
            .and_then(|t| t.get_statistics().cloned());

        Self {
            cost_estimator: CostEstimator::default(),
            index_info,
            table_stats,
            db,
            table_name,
        }
    }

    /// Get or compute table statistics with fallback for missing stats
    ///
    /// When actual statistics are unavailable (new table, no ANALYZE run),
    /// this creates fallback statistics based on available metadata.
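    ///
    /// # Example
    ///
    /// A minimal sketch: for a table that does not exist yet (the name
    /// `"t"` below is a placeholder), no real statistics are available,
    /// so the stale fallback with `row_count = 0` is returned.
    ///
    /// ```rust,no_run
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "t");
    /// let stats = optimizer.get_stats_with_fallback();
    /// assert!(stats.is_stale);
    /// assert_eq!(stats.row_count, 0);
    /// ```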
    pub fn get_stats_with_fallback(&self) -> TableStatistics {
        if let Some(ref stats) = self.table_stats {
            stats.clone()
        } else {
            // Create fallback statistics from available metadata
            self.create_fallback_stats()
        }
    }

    /// Create fallback statistics when actual stats are unavailable
    fn create_fallback_stats(&self) -> TableStatistics {
        // Get row count from table if available
        let row_count = self
            .db
            .get_table(self.table_name)
            .map(|t| t.row_count())
            .unwrap_or(0);

        // Create minimal statistics with all required fields
        TableStatistics {
            row_count,
            columns: std::collections::HashMap::new(),
            last_updated: instant::SystemTime::now(),
            is_stale: true, // Mark as stale since it's a fallback
            sample_metadata: None,
            avg_row_bytes: None, // No actual data sampled
        }
    }

    /// Determine optimal batch size for INSERT operations
    ///
    /// Uses cost estimation to decide between small and large batch sizes.
    /// High-cost tables (many indexes) benefit from smaller batches, which
    /// keep index-maintenance overhead from accumulating within one batch.
    ///
    /// # Arguments
    /// * `total_rows` - Total number of rows to insert
    ///
    /// # Returns
    /// Recommended batch size
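    ///
    /// # Example
    ///
    /// A sketch of the intended call pattern (`"orders"` is a placeholder
    /// table name); the result is always capped at `total_rows`:
    ///
    /// ```rust,no_run
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "orders");
    /// let batch = optimizer.optimal_insert_batch_size(5000);
    /// assert!(batch <= 5000);
    /// // Drive the actual INSERTs in batches of `batch` rows.
    /// ```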
    pub fn optimal_insert_batch_size(&self, total_rows: usize) -> usize {
        // If we don't have index info, use default large batch
        let index_info = match &self.index_info {
            Some(info) => info,
            None => return thresholds::LARGE_BATCH_SIZE.min(total_rows),
        };

        // Calculate estimated cost per row for a single insert
        let stats = self.get_stats_with_fallback();
        let single_row_cost = self.cost_estimator.estimate_insert(1, &stats, index_info);

        // Log cost for debugging (controlled by DML_COST_DEBUG env var)
        if std::env::var("DML_COST_DEBUG").is_ok() {
            eprintln!(
                "DML_COST_DEBUG: INSERT on {} - cost_per_row={:.3}, hash_indexes={}, btree_indexes={}",
                self.table_name,
                single_row_cost,
                index_info.hash_index_count,
                index_info.btree_index_count
            );
        }

        // High-cost tables benefit from smaller batches
        if single_row_cost > thresholds::HIGH_COST_INSERT_THRESHOLD {
            thresholds::SMALL_BATCH_SIZE.min(total_rows)
        } else {
            thresholds::LARGE_BATCH_SIZE.min(total_rows)
        }
    }

    /// Determine if DELETE should be chunked to avoid long locks
    ///
    /// Large deletes on high-cost tables can cause long pauses due to
    /// index maintenance. Chunking allows other operations to proceed
    /// between chunks.
    ///
    /// # Arguments
    /// * `rows_to_delete` - Number of rows to be deleted
    ///
    /// # Returns
    /// `true` if delete should be chunked, `false` for single operation
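    ///
    /// # Example
    ///
    /// A sketch of how a caller might drive a chunked delete (`"events"`
    /// is a placeholder table name; the per-chunk deletion is elided):
    ///
    /// ```rust,no_run
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "events");
    /// let rows_to_delete = 50_000;
    /// if optimizer.should_chunk_delete(rows_to_delete) {
    ///     let chunk = optimizer.delete_chunk_size();
    ///     // Delete `chunk` rows at a time, yielding between chunks.
    /// } else {
    ///     // Delete everything in a single operation.
    /// }
    /// ```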
    pub fn should_chunk_delete(&self, rows_to_delete: usize) -> bool {
        // Don't chunk small deletes
        if rows_to_delete < thresholds::CHUNK_DELETE_ROW_THRESHOLD {
            return false;
        }

        let index_info = match &self.index_info {
            Some(info) => info,
            None => return false,
        };

        // Calculate delete cost
        let stats = self.get_stats_with_fallback();
        let delete_cost =
            self.cost_estimator.estimate_delete(rows_to_delete, &stats, index_info);

        // Log cost for debugging
        if std::env::var("DML_COST_DEBUG").is_ok() {
            eprintln!(
                "DML_COST_DEBUG: DELETE on {} - rows={}, cost={:.3}, deleted_ratio={:.2}",
                self.table_name, rows_to_delete, delete_cost, index_info.deleted_ratio
            );
        }

        // The large-row-count condition was already checked above; chunk when,
        // in addition:
        // 1. the deleted ratio is high (compaction likely), or
        // 2. the table has many B-tree indexes.
        let high_deleted_ratio = index_info.deleted_ratio > thresholds::HIGH_DELETED_RATIO_THRESHOLD;
        let many_indexes = index_info.btree_index_count >= 3;

        high_deleted_ratio || many_indexes
    }

    /// Get recommended chunk size for chunked deletes
    pub fn delete_chunk_size(&self) -> usize {
        thresholds::DELETE_CHUNK_SIZE
    }

    /// Check if early compaction should be triggered
    ///
    /// Based on the current deleted ratio, determines if compaction
    /// should be triggered sooner than the default 50% threshold.
    ///
    /// # Returns
    /// `true` if early compaction is recommended
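    ///
    /// # Example
    ///
    /// A sketch of the intended call site; actually scheduling compaction
    /// is left to the storage layer and elided here:
    ///
    /// ```rust,no_run
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "events");
    /// if optimizer.should_trigger_early_compaction() {
    ///     // Ask the storage layer to compact ahead of the 50% default.
    /// }
    /// ```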
    pub fn should_trigger_early_compaction(&self) -> bool {
        let index_info = match &self.index_info {
            Some(info) => info,
            None => return false,
        };

        // Consider early compaction if deleted ratio is high and we have many indexes
        // Compaction rebuilds all indexes, so it's costly but reduces ongoing overhead
        index_info.deleted_ratio > thresholds::HIGH_DELETED_RATIO_THRESHOLD
            && index_info.btree_index_count >= 2
    }

    /// Compute the ratio of indexes affected by an UPDATE operation
    ///
    /// This is used for UPDATE cost estimation. If the update only touches
    /// non-indexed columns, the ratio is 0.0 and index maintenance is skipped.
    ///
    /// # Arguments
    /// * `changed_columns` - Set of column indices being modified
    /// * `schema` - Table schema for looking up index column info
    ///
    /// # Returns
    /// Ratio of indexes affected (0.0 to 1.0)
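    ///
    /// # Example
    ///
    /// A sketch assuming a table `"users"` already exists (the name is a
    /// placeholder); the schema is fetched from the catalog the same way
    /// this module's tests do:
    ///
    /// ```rust,no_run
    /// use std::collections::HashSet;
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "users");
    /// let schema = db.catalog.get_table("users").unwrap();
    /// // Suppose the UPDATE modifies columns 1 and 2.
    /// let changed: HashSet<usize> = [1, 2].into_iter().collect();
    /// let ratio = optimizer.compute_indexes_affected_ratio(&changed, schema);
    /// assert!((0.0..=1.0).contains(&ratio));
    /// ```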
    pub fn compute_indexes_affected_ratio(
        &self,
        changed_columns: &std::collections::HashSet<usize>,
        schema: &vibesql_catalog::TableSchema,
    ) -> f64 {
        let index_info = match &self.index_info {
            Some(info) => info,
            None => return 0.0,
        };

        let total_indexes = index_info.hash_index_count + index_info.btree_index_count;
        if total_indexes == 0 {
            return 0.0;
        }

        let mut affected_indexes = 0;

        // Check PK (counts as 1 hash index)
        if let Some(pk_indices) = schema.get_primary_key_indices() {
            if pk_indices.iter().any(|i| changed_columns.contains(i)) {
                affected_indexes += 1;
            }
        }

        // Check unique constraints (each is a hash index)
        for unique_cols in &schema.unique_constraints {
            let unique_indices: Vec<usize> = unique_cols
                .iter()
                .filter_map(|name| schema.get_column_index(name))
                .collect();
            if unique_indices.iter().any(|i| changed_columns.contains(i)) {
                affected_indexes += 1;
            }
        }

        // For B-tree indexes, we need to check against user-defined indexes.
        // This is done at the database level via has_index_on_column.
        let changed_column_names: Vec<String> = changed_columns
            .iter()
            .filter_map(|&i| schema.columns.get(i).map(|c| c.name.clone()))
            .collect();

        for col_name in &changed_column_names {
            if self.db.has_index_on_column(self.table_name, col_name) {
                affected_indexes += 1;
                // `has_index_on_column` cannot tell us *which* index matched,
                // so conservatively count at most one affected B-tree index
                // rather than risk counting the same index once per column.
                break;
            }
        }

        affected_indexes as f64 / total_indexes as f64
    }

    /// Estimate the cost of an UPDATE operation
    ///
    /// # Arguments
    /// * `row_count` - Number of rows to update
    /// * `indexes_affected_ratio` - Ratio of indexes affected (from compute_indexes_affected_ratio)
    ///
    /// # Returns
    /// Estimated cost in arbitrary units
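    ///
    /// # Example
    ///
    /// Comparing a full update (all indexes affected) against a selective
    /// one (no indexed columns touched), mirroring the tests below
    /// (`"users"` is a placeholder table name):
    ///
    /// ```rust,no_run
    /// use vibesql_executor::DmlOptimizer;
    /// use vibesql_storage::Database;
    ///
    /// let db = Database::new();
    /// let optimizer = DmlOptimizer::new(&db, "users");
    /// let full_cost = optimizer.estimate_update_cost(100, 1.0);
    /// let selective_cost = optimizer.estimate_update_cost(100, 0.0);
    /// assert!(full_cost >= selective_cost);
    /// ```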
    pub fn estimate_update_cost(&self, row_count: usize, indexes_affected_ratio: f64) -> f64 {
        let index_info = match &self.index_info {
            Some(info) => info,
            None => return 0.0,
        };

        let stats = self.get_stats_with_fallback();
        let cost = self
            .cost_estimator
            .estimate_update(row_count, &stats, index_info, indexes_affected_ratio);

        if std::env::var("DML_COST_DEBUG").is_ok() {
            eprintln!(
                "DML_COST_DEBUG: UPDATE on {} - rows={}, affected_ratio={:.2}, cost={:.3}",
                self.table_name, row_count, indexes_affected_ratio, cost
            );
        }

        cost
    }

    /// Get the current table index info (for external use)
    pub fn index_info(&self) -> Option<&TableIndexInfo> {
        self.index_info.as_ref()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use vibesql_catalog::{ColumnSchema, TableSchema};
    use vibesql_types::DataType;

    fn create_test_db_with_table(
        table_name: &str,
        with_pk: bool,
        btree_index_count: usize,
    ) -> Database {
        let mut db = Database::new();

        let schema = if with_pk {
            TableSchema::with_primary_key(
                table_name.to_string(),
                vec![
                    ColumnSchema::new("id".to_string(), DataType::Integer, false),
                    ColumnSchema::new("name".to_string(), DataType::Varchar { max_length: Some(100) }, false),
                    ColumnSchema::new("value".to_string(), DataType::Integer, true),
                ],
                vec!["id".to_string()],
            )
        } else {
            TableSchema::new(
                table_name.to_string(),
                vec![
                    ColumnSchema::new("id".to_string(), DataType::Integer, false),
                    ColumnSchema::new("name".to_string(), DataType::Varchar { max_length: Some(100) }, false),
                ],
            )
        };
        db.create_table(schema).unwrap();

        // Create user-defined B-tree indexes
        for i in 0..btree_index_count {
            db.create_index(
                format!("idx_{}_{}", table_name, i),
                table_name.to_string(),
                false,
                vec![vibesql_ast::IndexColumn {
                    column_name: "name".to_string(),
                    direction: vibesql_ast::OrderDirection::Asc,
                    prefix_length: None,
                }],
            )
            .unwrap();
        }

        db
    }

    #[test]
    fn test_optimal_insert_batch_size_low_cost() {
        let db = create_test_db_with_table("test_table", true, 0);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        // Table with just a PK should return a reasonable batch size:
        // either SMALL_BATCH_SIZE or LARGE_BATCH_SIZE.
        let batch_size = optimizer.optimal_insert_batch_size(5000);
        assert!(
            batch_size == thresholds::SMALL_BATCH_SIZE
                || batch_size == thresholds::LARGE_BATCH_SIZE
        );
    }

    #[test]
    fn test_optimal_insert_batch_size_high_cost() {
        let db = create_test_db_with_table("test_table", true, 5);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        // Table with many B-tree indexes should still return a valid batch size
        let batch_size = optimizer.optimal_insert_batch_size(5000);
        assert!(batch_size <= thresholds::LARGE_BATCH_SIZE);
        assert!(batch_size >= thresholds::SMALL_BATCH_SIZE);
    }

    #[test]
    fn test_optimal_insert_batch_size_more_indexes_smaller_batch() {
        // A table with many indexes should get an equal or smaller batch
        // size than a table with few indexes.
        let db_few = create_test_db_with_table("table_few", true, 1);
        let db_many = create_test_db_with_table("table_many", true, 5);

        let optimizer_few = DmlOptimizer::new(&db_few, "table_few");
        let optimizer_many = DmlOptimizer::new(&db_many, "table_many");

        let batch_few = optimizer_few.optimal_insert_batch_size(5000);
        let batch_many = optimizer_many.optimal_insert_batch_size(5000);

        assert!(batch_many <= batch_few);
    }

    #[test]
    fn test_should_chunk_delete_small() {
        let db = create_test_db_with_table("test_table", true, 0);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        // Small deletes should not be chunked
        assert!(!optimizer.should_chunk_delete(100));
    }

    #[test]
    fn test_compute_indexes_affected_ratio_no_indexes() {
        let db = create_test_db_with_table("test_table", false, 0);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        let schema = db.catalog.get_table("test_table").unwrap();
        let changed_columns: std::collections::HashSet<usize> = [1].into_iter().collect();

        let ratio = optimizer.compute_indexes_affected_ratio(&changed_columns, schema);
        assert_eq!(ratio, 0.0);
    }

    #[test]
    fn test_compute_indexes_affected_ratio_pk_affected() {
        let db = create_test_db_with_table("test_table", true, 0);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        let schema = db.catalog.get_table("test_table").unwrap();
        // Column 0 is the PK
        let changed_columns: std::collections::HashSet<usize> = [0].into_iter().collect();

        let ratio = optimizer.compute_indexes_affected_ratio(&changed_columns, schema);
        assert!(ratio > 0.0, "PK update should affect at least one index");
    }

    #[test]
    fn test_fallback_stats() {
        let db = create_test_db_with_table("test_table", true, 0);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        // A new table has no computed statistics, so the fallback with
        // row_count = 0 is returned.
        let stats = optimizer.get_stats_with_fallback();
        assert_eq!(stats.row_count, 0);
    }

    #[test]
    fn test_estimate_update_cost() {
        let db = create_test_db_with_table("test_table", true, 2);
        let optimizer = DmlOptimizer::new(&db, "test_table");

        // Full update (all indexes affected)
        let full_cost = optimizer.estimate_update_cost(100, 1.0);

        // Selective update (no indexes affected)
        let selective_cost = optimizer.estimate_update_cost(100, 0.0);

        assert!(
            full_cost > selective_cost,
            "Full update should cost more than selective"
        );
    }
}