Skip to main content

sochdb_query/
query_optimizer.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Query Optimizer with Cost-Based Planning
19//!
20//! Provides cost-based query optimization for SOCH-QL.
21//! Selects index, estimates cardinality, and chooses the cheapest plan.
22
23use std::collections::HashMap;
24
25/// Query optimizer with cost-based planning
26#[derive(Debug, Default)]
27pub struct QueryOptimizer {
28    cost_model: CostModel,
29    cardinality_hints: HashMap<String, CardinalityHint>,
30    total_edges: usize,
31}
32
33/// Cost model configuration
34#[derive(Debug, Clone)]
35pub struct CostModel {
36    /// Cost of reading a single row
37    pub row_read_cost: f64,
38    /// Cost of index lookup
39    pub index_lookup_cost: f64,
40    /// Cost of sequential scan per row
41    pub seq_scan_cost: f64,
42    /// Cost of vector search per vector
43    pub vector_search_cost: f64,
44}
45
46impl Default for CostModel {
47    fn default() -> Self {
48        Self {
49            row_read_cost: 1.0,
50            index_lookup_cost: 0.5,
51            seq_scan_cost: 0.1,
52            vector_search_cost: 10.0,
53        }
54    }
55}
56
57/// Cardinality hint for a column or index
58#[derive(Debug, Clone)]
59pub struct CardinalityHint {
60    /// Estimated cardinality
61    pub cardinality: usize,
62    /// Confidence level (0-1)
63    pub confidence: f64,
64    /// Source of the estimate
65    pub source: CardinalitySource,
66}
67
68/// Source of cardinality estimate
69#[derive(Debug, Clone, Copy, PartialEq, Eq)]
70pub enum CardinalitySource {
71    /// Exact count
72    Exact,
73    /// HyperLogLog estimate
74    HyperLogLog,
75    /// Histogram estimate
76    Histogram,
77    /// Default/unknown
78    Default,
79}
80
81/// Query predicate for optimization
82#[derive(Debug, Clone)]
83pub enum QueryPredicate {
84    /// Equality predicate
85    Eq(String, String),
86    /// Range predicate
87    Range {
88        column: String,
89        min: Option<String>,
90        max: Option<String>,
91    },
92    /// In list predicate
93    In(String, Vec<String>),
94    /// Prefix match
95    Prefix(String, String),
96    /// Time range predicate
97    TimeRange(u64, u64),
98    /// Project filter
99    Project(u16),
100    /// Tenant filter
101    Tenant(u32),
102    /// Span type filter
103    SpanType(String),
104}
105
106/// Query operation types
107#[derive(Debug, Clone, PartialEq)]
108pub enum QueryOperation {
109    /// Point lookup
110    PointLookup,
111    /// Range scan
112    RangeScan,
113    /// Full scan
114    FullScan,
115    /// Vector search
116    VectorSearch { k: usize },
117    /// LSM range scan
118    LsmRangeScan { start_us: u64, end_us: u64 },
119    /// Graph traversal
120    GraphTraversal {
121        direction: TraversalDirection,
122        max_depth: usize,
123    },
124}
125
126/// Index selection recommendation
127#[derive(Debug, Clone)]
128pub enum IndexSelection {
129    /// Use primary key index
130    PrimaryKey,
131    /// Use secondary index
132    Secondary(String),
133    /// Use time-based index
134    TimeIndex,
135    /// Use vector index
136    VectorIndex,
137    /// Full table scan / LSM scan
138    FullScan,
139    /// LSM scan
140    LsmScan,
141    /// Causal index
142    CausalIndex,
143    /// Project index
144    ProjectIndex,
145    /// Multi-index intersection
146    MultiIndex(Vec<IndexSelection>),
147}
148
149/// Query cost estimate
150#[derive(Debug, Clone)]
151pub struct QueryCost {
152    /// Estimated cost
153    pub estimated_cost: f64,
154    /// Total cost (alias for compatibility)
155    pub total_cost: f64,
156    /// Estimated rows
157    pub estimated_rows: usize,
158    /// Records returned (alias)
159    pub records_returned: usize,
160    /// Selected index
161    pub index: IndexSelection,
162    /// Operation type
163    pub operation: QueryOperation,
164    /// Cost breakdown by operation
165    pub breakdown: Vec<(QueryOperation, f64)>,
166}
167
168/// Traversal direction for range queries
169#[derive(Debug, Clone, Copy, PartialEq, Eq)]
170pub enum TraversalDirection {
171    Forward,
172    Backward,
173}
174
175/// Query plan from optimizer
176#[derive(Debug, Clone)]
177pub struct QueryPlan {
178    /// Cost estimate
179    pub cost: QueryCost,
180    /// Predicates to apply
181    pub predicates: Vec<QueryPredicate>,
182    /// Traversal direction
183    pub direction: TraversalDirection,
184    /// Limit
185    pub limit: Option<usize>,
186    /// Index selection
187    pub index_selection: IndexSelection,
188    /// Operations to execute
189    pub operations: Vec<QueryOperation>,
190}
191
192impl QueryOptimizer {
193    /// Create a new query optimizer
194    pub fn new() -> Self {
195        Self::default()
196    }
197
198    /// Create with custom cost model
199    pub fn with_cost_model(cost_model: CostModel) -> Self {
200        Self {
201            cost_model,
202            ..Default::default()
203        }
204    }
205
206    /// Update total edge count for cardinality estimation
207    pub fn update_total_edges(&mut self, count: usize, _source: CardinalitySource) {
208        self.total_edges = count;
209    }
210
211    /// Set cardinality hint for a column
212    pub fn set_cardinality_hint(&mut self, column: &str, hint: CardinalityHint) {
213        self.cardinality_hints.insert(column.to_string(), hint);
214    }
215
216    /// Plan a query using cost-based index selection
217    ///
218    /// Evaluates each candidate index against a full scan, choosing the
219    /// cheapest access path based on selectivity × row count.
220    pub fn plan_query(&self, predicates: &[QueryPredicate], limit: Option<usize>) -> QueryPlan {
221        if predicates.is_empty() {
222            return self.build_plan(
223                IndexSelection::FullScan,
224                QueryOperation::FullScan,
225                predicates,
226                limit,
227            );
228        }
229
230        // Score each candidate index
231        let mut candidates: Vec<(IndexSelection, QueryOperation, f64, usize)> = Vec::new();
232
233        for pred in predicates {
234            let sel = self.estimate_selectivity(pred);
235            let est_rows = (self.total_edges as f64 * sel).max(1.0) as usize;
236
237            match pred {
238                QueryPredicate::Eq(_, _) => {
239                    let cost = self.cost_model.index_lookup_cost;
240                    candidates.push((IndexSelection::PrimaryKey, QueryOperation::PointLookup, cost, 1));
241                }
242                QueryPredicate::TimeRange(start, end) => {
243                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
244                    candidates.push((
245                        IndexSelection::TimeIndex,
246                        QueryOperation::LsmRangeScan {
247                            start_us: *start,
248                            end_us: *end,
249                        },
250                        cost,
251                        est_rows,
252                    ));
253                }
254                QueryPredicate::Range { .. } | QueryPredicate::Prefix(_, _) => {
255                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
256                    candidates.push((
257                        IndexSelection::LsmScan,
258                        QueryOperation::RangeScan,
259                        cost,
260                        est_rows,
261                    ));
262                }
263                QueryPredicate::Project(_) => {
264                    let cost = est_rows as f64 * self.cost_model.index_lookup_cost;
265                    candidates.push((
266                        IndexSelection::ProjectIndex,
267                        QueryOperation::RangeScan,
268                        cost,
269                        est_rows,
270                    ));
271                }
272                QueryPredicate::Tenant(_) | QueryPredicate::SpanType(_) | QueryPredicate::In(_, _) => {
273                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
274                    candidates.push((
275                        IndexSelection::FullScan,
276                        QueryOperation::RangeScan,
277                        cost,
278                        est_rows,
279                    ));
280                }
281            }
282        }
283
284        // Full scan baseline
285        let full_scan_cost = self.total_edges.max(1) as f64 * self.cost_model.seq_scan_cost;
286        candidates.push((
287            IndexSelection::FullScan,
288            QueryOperation::FullScan,
289            full_scan_cost,
290            self.total_edges.max(1),
291        ));
292
293        // Choose cheapest
294        candidates.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal));
295        let (index, operation, _cost, _rows) = candidates.into_iter().next().unwrap();
296
297        self.build_plan(index, operation, predicates, limit)
298    }
299
300    /// Build a QueryPlan from chosen access path
301    fn build_plan(
302        &self,
303        index: IndexSelection,
304        operation: QueryOperation,
305        predicates: &[QueryPredicate],
306        limit: Option<usize>,
307    ) -> QueryPlan {
308        let estimated_rows = match &index {
309            IndexSelection::PrimaryKey => 1,
310            IndexSelection::FullScan => self.total_edges.max(1),
311            _ => {
312                let sel: f64 = predicates
313                    .iter()
314                    .map(|p| self.estimate_selectivity(p))
315                    .product();
316                (self.total_edges as f64 * sel).max(1.0) as usize
317            }
318        };
319
320        let estimated_cost = match &operation {
321            QueryOperation::PointLookup => self.cost_model.index_lookup_cost,
322            QueryOperation::RangeScan => estimated_rows as f64 * self.cost_model.row_read_cost,
323            QueryOperation::FullScan => self.total_edges as f64 * self.cost_model.seq_scan_cost,
324            QueryOperation::VectorSearch { .. } => self.cost_model.vector_search_cost,
325            QueryOperation::LsmRangeScan { .. } => estimated_rows as f64 * self.cost_model.row_read_cost,
326            QueryOperation::GraphTraversal { .. } => estimated_rows as f64 * self.cost_model.row_read_cost,
327        };
328
329        QueryPlan {
330            cost: QueryCost {
331                estimated_cost,
332                total_cost: estimated_cost,
333                estimated_rows,
334                records_returned: estimated_rows,
335                index: index.clone(),
336                operation: operation.clone(),
337                breakdown: vec![(operation.clone(), estimated_cost)],
338            },
339            predicates: predicates.to_vec(),
340            direction: TraversalDirection::Forward,
341            limit,
342            index_selection: index,
343            operations: vec![operation],
344        }
345    }
346
347    /// Estimate selectivity of a predicate
348    pub fn estimate_selectivity(&self, predicate: &QueryPredicate) -> f64 {
349        match predicate {
350            QueryPredicate::Eq(col, _) => {
351                if let Some(hint) = self.cardinality_hints.get(col) {
352                    1.0 / hint.cardinality.max(1) as f64
353                } else {
354                    0.1 // Default 10% selectivity
355                }
356            }
357            QueryPredicate::Range { .. } => 0.25, // Default 25% for range
358            QueryPredicate::In(_, values) => (values.len() as f64 * 0.1).min(0.5),
359            QueryPredicate::Prefix(_, _) => 0.15, // Default 15% for prefix
360            QueryPredicate::TimeRange(_, _) => 0.2,
361            QueryPredicate::Project(_) => 0.1,
362            QueryPredicate::Tenant(_) => 0.05,
363            QueryPredicate::SpanType(_) => 0.15,
364        }
365    }
366}