Skip to main content

sochdb_query/
query_optimizer.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Query Optimizer with Cost-Based Planning
19//!
20//! Provides cost-based query optimization for SOCH-QL.
21//! Selects index, estimates cardinality, and chooses the cheapest plan.
22
23use std::collections::HashMap;
24
25/// Query optimizer with cost-based planning
26#[derive(Debug, Default)]
27pub struct QueryOptimizer {
28    cost_model: CostModel,
29    cardinality_hints: HashMap<String, CardinalityHint>,
30    total_edges: usize,
31}
32
33/// Cost model configuration
34#[derive(Debug, Clone)]
35pub struct CostModel {
36    /// Cost of reading a single row
37    pub row_read_cost: f64,
38    /// Cost of index lookup
39    pub index_lookup_cost: f64,
40    /// Cost of sequential scan per row
41    pub seq_scan_cost: f64,
42    /// Cost of vector search per vector
43    pub vector_search_cost: f64,
44}
45
46impl Default for CostModel {
47    fn default() -> Self {
48        Self {
49            row_read_cost: 1.0,
50            index_lookup_cost: 0.5,
51            seq_scan_cost: 0.1,
52            vector_search_cost: 10.0,
53        }
54    }
55}
56
57/// Cardinality hint for a column or index
58#[derive(Debug, Clone)]
59pub struct CardinalityHint {
60    /// Estimated cardinality
61    pub cardinality: usize,
62    /// Confidence level (0-1)
63    pub confidence: f64,
64    /// Source of the estimate
65    pub source: CardinalitySource,
66}
67
68/// Source of cardinality estimate
69#[derive(Debug, Clone, Copy, PartialEq, Eq)]
70pub enum CardinalitySource {
71    /// Exact count
72    Exact,
73    /// HyperLogLog estimate
74    HyperLogLog,
75    /// Histogram estimate
76    Histogram,
77    /// Default/unknown
78    Default,
79}
80
81/// Query predicate for optimization
82#[derive(Debug, Clone)]
83pub enum QueryPredicate {
84    /// Equality predicate
85    Eq(String, String),
86    /// Range predicate
87    Range {
88        column: String,
89        min: Option<String>,
90        max: Option<String>,
91    },
92    /// In list predicate
93    In(String, Vec<String>),
94    /// Prefix match
95    Prefix(String, String),
96    /// Time range predicate
97    TimeRange(u64, u64),
98    /// Project filter
99    Project(u16),
100    /// Tenant filter
101    Tenant(u32),
102    /// Span type filter
103    SpanType(String),
104}
105
106/// Query operation types
107#[derive(Debug, Clone, PartialEq)]
108pub enum QueryOperation {
109    /// Point lookup
110    PointLookup,
111    /// Range scan
112    RangeScan,
113    /// Full scan
114    FullScan,
115    /// Vector search
116    VectorSearch { k: usize },
117    /// LSM range scan
118    LsmRangeScan { start_us: u64, end_us: u64 },
119    /// Graph traversal
120    GraphTraversal {
121        direction: TraversalDirection,
122        max_depth: usize,
123    },
124}
125
126/// Index selection recommendation
127#[derive(Debug, Clone)]
128pub enum IndexSelection {
129    /// Use primary key index
130    PrimaryKey,
131    /// Use secondary index
132    Secondary(String),
133    /// Use time-based index
134    TimeIndex,
135    /// Use vector index
136    VectorIndex,
137    /// Full table scan / LSM scan
138    FullScan,
139    /// LSM scan
140    LsmScan,
141    /// Causal index
142    CausalIndex,
143    /// Project index
144    ProjectIndex,
145    /// Multi-index intersection
146    MultiIndex(Vec<IndexSelection>),
147}
148
149/// Query cost estimate
150#[derive(Debug, Clone)]
151pub struct QueryCost {
152    /// Estimated cost
153    pub estimated_cost: f64,
154    /// Total cost (alias for compatibility)
155    pub total_cost: f64,
156    /// Estimated rows
157    pub estimated_rows: usize,
158    /// Records returned (alias)
159    pub records_returned: usize,
160    /// Selected index
161    pub index: IndexSelection,
162    /// Operation type
163    pub operation: QueryOperation,
164    /// Cost breakdown by operation
165    pub breakdown: Vec<(QueryOperation, f64)>,
166}
167
168/// Traversal direction for range queries
169#[derive(Debug, Clone, Copy, PartialEq, Eq)]
170pub enum TraversalDirection {
171    Forward,
172    Backward,
173}
174
175/// Query plan from optimizer
176#[derive(Debug, Clone)]
177pub struct QueryPlan {
178    /// Cost estimate
179    pub cost: QueryCost,
180    /// Predicates to apply
181    pub predicates: Vec<QueryPredicate>,
182    /// Traversal direction
183    pub direction: TraversalDirection,
184    /// Limit
185    pub limit: Option<usize>,
186    /// Index selection
187    pub index_selection: IndexSelection,
188    /// Operations to execute
189    pub operations: Vec<QueryOperation>,
190}
191
192impl QueryOptimizer {
193    /// Create a new query optimizer
194    pub fn new() -> Self {
195        Self::default()
196    }
197
198    /// Create with custom cost model
199    pub fn with_cost_model(cost_model: CostModel) -> Self {
200        Self {
201            cost_model,
202            ..Default::default()
203        }
204    }
205
206    /// Update total edge count for cardinality estimation
207    pub fn update_total_edges(&mut self, count: usize, _source: CardinalitySource) {
208        self.total_edges = count;
209    }
210
211    /// Set cardinality hint for a column
212    pub fn set_cardinality_hint(&mut self, column: &str, hint: CardinalityHint) {
213        self.cardinality_hints.insert(column.to_string(), hint);
214    }
215
216    /// Plan a query using cost-based index selection
217    ///
218    /// Evaluates each candidate index against a full scan, choosing the
219    /// cheapest access path based on selectivity × row count.
220    pub fn plan_query(&self, predicates: &[QueryPredicate], limit: Option<usize>) -> QueryPlan {
221        if predicates.is_empty() {
222            return self.build_plan(
223                IndexSelection::FullScan,
224                QueryOperation::FullScan,
225                predicates,
226                limit,
227            );
228        }
229
230        // Score each candidate index
231        let mut candidates: Vec<(IndexSelection, QueryOperation, f64, usize)> = Vec::new();
232
233        for pred in predicates {
234            let sel = self.estimate_selectivity(pred);
235            let est_rows = (self.total_edges as f64 * sel).max(1.0) as usize;
236
237            match pred {
238                QueryPredicate::Eq(_, _) => {
239                    let cost = self.cost_model.index_lookup_cost;
240                    candidates.push((
241                        IndexSelection::PrimaryKey,
242                        QueryOperation::PointLookup,
243                        cost,
244                        1,
245                    ));
246                }
247                QueryPredicate::TimeRange(start, end) => {
248                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
249                    candidates.push((
250                        IndexSelection::TimeIndex,
251                        QueryOperation::LsmRangeScan {
252                            start_us: *start,
253                            end_us: *end,
254                        },
255                        cost,
256                        est_rows,
257                    ));
258                }
259                QueryPredicate::Range { .. } | QueryPredicate::Prefix(_, _) => {
260                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
261                    candidates.push((
262                        IndexSelection::LsmScan,
263                        QueryOperation::RangeScan,
264                        cost,
265                        est_rows,
266                    ));
267                }
268                QueryPredicate::Project(_) => {
269                    let cost = est_rows as f64 * self.cost_model.index_lookup_cost;
270                    candidates.push((
271                        IndexSelection::ProjectIndex,
272                        QueryOperation::RangeScan,
273                        cost,
274                        est_rows,
275                    ));
276                }
277                QueryPredicate::Tenant(_)
278                | QueryPredicate::SpanType(_)
279                | QueryPredicate::In(_, _) => {
280                    let cost = est_rows as f64 * self.cost_model.row_read_cost;
281                    candidates.push((
282                        IndexSelection::FullScan,
283                        QueryOperation::RangeScan,
284                        cost,
285                        est_rows,
286                    ));
287                }
288            }
289        }
290
291        // Full scan baseline
292        let full_scan_cost = self.total_edges.max(1) as f64 * self.cost_model.seq_scan_cost;
293        candidates.push((
294            IndexSelection::FullScan,
295            QueryOperation::FullScan,
296            full_scan_cost,
297            self.total_edges.max(1),
298        ));
299
300        // Choose cheapest
301        candidates.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal));
302        let (index, operation, _cost, _rows) = candidates.into_iter().next().unwrap();
303
304        self.build_plan(index, operation, predicates, limit)
305    }
306
307    /// Build a QueryPlan from chosen access path
308    fn build_plan(
309        &self,
310        index: IndexSelection,
311        operation: QueryOperation,
312        predicates: &[QueryPredicate],
313        limit: Option<usize>,
314    ) -> QueryPlan {
315        let estimated_rows = match &index {
316            IndexSelection::PrimaryKey => 1,
317            IndexSelection::FullScan => self.total_edges.max(1),
318            _ => {
319                let sel: f64 = predicates
320                    .iter()
321                    .map(|p| self.estimate_selectivity(p))
322                    .product();
323                (self.total_edges as f64 * sel).max(1.0) as usize
324            }
325        };
326
327        let estimated_cost = match &operation {
328            QueryOperation::PointLookup => self.cost_model.index_lookup_cost,
329            QueryOperation::RangeScan => estimated_rows as f64 * self.cost_model.row_read_cost,
330            QueryOperation::FullScan => self.total_edges as f64 * self.cost_model.seq_scan_cost,
331            QueryOperation::VectorSearch { .. } => self.cost_model.vector_search_cost,
332            QueryOperation::LsmRangeScan { .. } => {
333                estimated_rows as f64 * self.cost_model.row_read_cost
334            }
335            QueryOperation::GraphTraversal { .. } => {
336                estimated_rows as f64 * self.cost_model.row_read_cost
337            }
338        };
339
340        QueryPlan {
341            cost: QueryCost {
342                estimated_cost,
343                total_cost: estimated_cost,
344                estimated_rows,
345                records_returned: estimated_rows,
346                index: index.clone(),
347                operation: operation.clone(),
348                breakdown: vec![(operation.clone(), estimated_cost)],
349            },
350            predicates: predicates.to_vec(),
351            direction: TraversalDirection::Forward,
352            limit,
353            index_selection: index,
354            operations: vec![operation],
355        }
356    }
357
358    /// Estimate selectivity of a predicate
359    pub fn estimate_selectivity(&self, predicate: &QueryPredicate) -> f64 {
360        match predicate {
361            QueryPredicate::Eq(col, _) => {
362                if let Some(hint) = self.cardinality_hints.get(col) {
363                    1.0 / hint.cardinality.max(1) as f64
364                } else {
365                    0.1 // Default 10% selectivity
366                }
367            }
368            QueryPredicate::Range { .. } => 0.25, // Default 25% for range
369            QueryPredicate::In(_, values) => (values.len() as f64 * 0.1).min(0.5),
370            QueryPredicate::Prefix(_, _) => 0.15, // Default 15% for prefix
371            QueryPredicate::TimeRange(_, _) => 0.2,
372            QueryPredicate::Project(_) => 0.1,
373            QueryPredicate::Tenant(_) => 0.05,
374            QueryPredicate::SpanType(_) => 0.15,
375        }
376    }
377}