Skip to main content

velesdb_core/collection/search/query/
mod.rs

1//! VelesQL query execution for Collection.
2//!
3//! This module orchestrates query execution by combining:
4//! - Query validation (`validation.rs`)
5//! - Condition extraction (`extraction.rs`)
6//! - ORDER BY processing (`ordering.rs`)
7//!
8//! # Future Enhancement: HybridExecutionPlan Integration
9//!
10//! The `HybridExecutionPlan` and `choose_hybrid_strategy()` in `planner.rs`
11//! are ready for integration to optimize query execution based on:
12//! - Query pattern (ORDER BY similarity, filters, etc.)
13//! - Runtime statistics (latency, selectivity)
14//! - Over-fetch factor for filtered queries
15//!
16//! Future: Integrate `QueryPlanner::choose_hybrid_strategy()` into `execute_query()`
17//! to leverage cost-based optimization for complex queries.
18
19// SAFETY: Numeric casts in query execution are intentional:
20// - f64->f32 for similarity thresholds: precision loss acceptable for filtering
21// - Thresholds are approximate bounds, exact precision not required
22#![allow(clippy::cast_precision_loss)]
23#![allow(clippy::cast_sign_loss)]
24#![allow(clippy::uninlined_format_args)] // Prefer readability in query error paths.
25#![allow(clippy::implicit_hasher)] // HashSet hasher genericity adds noise for internal APIs.
26
27mod aggregation;
28mod distinct;
29mod execution_paths;
30mod extraction;
31#[cfg(test)]
32mod extraction_tests;
33pub mod join;
34#[cfg(test)]
35mod join_tests;
36pub mod match_exec;
37#[cfg(test)]
38mod match_exec_tests;
39pub mod match_metrics;
40#[cfg(test)]
41mod match_metrics_tests;
42pub mod match_planner;
43#[cfg(test)]
44mod match_planner_tests;
45mod ordering;
46pub mod parallel_traversal;
47#[cfg(test)]
48mod parallel_traversal_tests;
49pub mod pushdown;
50#[cfg(test)]
51mod pushdown_tests;
52pub mod score_fusion;
53#[cfg(test)]
54mod score_fusion_tests;
55mod similarity_filter;
56mod union_query;
57mod validation;
58mod where_eval;
59
60// Re-export for potential external use
61#[allow(unused_imports)]
62pub use ordering::compare_json_values;
63// Re-export join functions for future integration with execute_query
64#[allow(unused_imports)]
65pub use join::{execute_join, JoinedResult};
66
67use crate::collection::types::Collection;
68use crate::error::Result;
69use crate::point::SearchResult;
70use std::collections::HashSet;
71
72/// Maximum allowed LIMIT value to prevent overflow in over-fetch calculations.
///
/// Within this file it is also used as the fallback when a requested LIMIT does not
/// convert to `usize`, as the upper bound on the over-fetched candidate count for
/// `similarity()` queries, and as the internal fetch size for queries that carry graph
/// predicates (those are filtered after retrieval and truncated to the user LIMIT last).
73const MAX_LIMIT: usize = 100_000;
74
75impl Collection {
76    /// Executes a `VelesQL` query on this collection.
77    ///
78    /// This method unifies vector search, text search, and metadata filtering
79    /// into a single interface.
80    ///
81    /// # Arguments
82    ///
83    /// * `query` - Parsed `VelesQL` query
84    /// * `params` - Query parameters for resolving placeholders (e.g., $v)
85    ///
86    /// # Errors
87    ///
88    /// Returns an error if the query cannot be executed (e.g., missing parameters).
89    #[allow(clippy::too_many_lines)] // Complex dispatch logic - refactoring planned
90    pub fn execute_query(
91        &self,
92        query: &crate::velesql::Query,
93        params: &std::collections::HashMap<String, serde_json::Value>,
94    ) -> Result<Vec<SearchResult>> {
95        crate::velesql::QueryValidator::validate(query)
96            .map_err(|e| crate::error::Error::Query(e.to_string()))?;
97
98        // Unified VelesQL dispatch: allow Collection::execute_query() to run top-level MATCH queries.
99        if let Some(match_clause) = query.match_clause.as_ref() {
100            let mut match_results = self.execute_match(match_clause, params)?;
101
102            if let Some(order_by) = match_clause.return_clause.order_by.as_ref() {
103                for item in order_by.iter().rev() {
104                    self.order_match_results(&mut match_results, &item.expression, item.descending);
105                }
106            }
107
108            let mut results = self.match_results_to_search_results(match_results)?;
109            if let Some(limit) = match_clause.return_clause.limit {
110                let limit = usize::try_from(limit).unwrap_or(MAX_LIMIT).min(MAX_LIMIT);
111                results.truncate(limit);
112            }
113            return Ok(results);
114        }
115
116        let stmt = &query.select;
117        // Cap limit to prevent overflow in over-fetch calculations
118        let limit = usize::try_from(stmt.limit.unwrap_or(10))
119            .unwrap_or(MAX_LIMIT)
120            .min(MAX_LIMIT);
121
122        // 1. Extract vector search (NEAR) or similarity() conditions if present
123        let mut vector_search = None;
124        let mut similarity_conditions: Vec<(String, Vec<f32>, crate::velesql::CompareOp, f64)> =
125            Vec::new();
126        let mut filter_condition = None;
127        let mut graph_match_predicates = Vec::new();
128
129        // EPIC-044 US-002: Check for similarity() OR metadata pattern (union mode)
130        let is_union_query = if let Some(ref cond) = stmt.where_clause {
131            Self::has_similarity_in_problematic_or(cond)
132        } else {
133            false
134        };
135
136        // EPIC-044 US-003: Check for NOT similarity() pattern (scan mode)
137        let is_not_similarity_query = if let Some(ref cond) = stmt.where_clause {
138            Self::has_similarity_under_not(cond)
139        } else {
140            false
141        };
142
143        if let Some(ref cond) = stmt.where_clause {
144            // Validate query structure before extraction
145            Self::validate_similarity_query_structure(cond)?;
146            Self::collect_graph_match_predicates(cond, &mut graph_match_predicates);
147
148            let mut extracted_cond = cond.clone();
149            vector_search = self.extract_vector_search(&mut extracted_cond, params)?;
150            // EPIC-044 US-001: Extract ALL similarity conditions for cascade filtering
151            similarity_conditions =
152                self.extract_all_similarity_conditions(&extracted_cond, params)?;
153            filter_condition = Some(extracted_cond);
154
155            // NEAR + similarity() is supported: NEAR finds candidates, similarity() filters by threshold
156            // Multiple similarity() with AND is supported: filters applied sequentially (cascade)
157        }
158
159        // 2. Resolve WITH clause options
160        let mut ef_search = None;
161        if let Some(ref with) = stmt.with_clause {
162            ef_search = with.get_ef_search();
163        }
164
165        // Get first similarity condition for initial search (if any)
166        let first_similarity = similarity_conditions.first().cloned();
167        let has_graph_predicates = !graph_match_predicates.is_empty();
168        let skip_metadata_prefilter_for_graph_or = has_graph_predicates
169            && stmt
170                .where_clause
171                .as_ref()
172                .is_some_and(Self::condition_contains_or);
173        let execution_limit = if has_graph_predicates {
174            MAX_LIMIT
175        } else {
176            limit
177        };
178
179        // 3. Execute query based on extracted components
180        // EPIC-044 US-003: NOT similarity() requires full scan
181        if is_not_similarity_query {
182            if let Some(ref cond) = stmt.where_clause {
183                let mut results =
184                    self.execute_not_similarity_query(cond, params, execution_limit)?;
185                if has_graph_predicates {
186                    results = self.apply_where_condition_to_results(
187                        results,
188                        cond,
189                        params,
190                        stmt.from_alias.as_deref(),
191                    )?;
192                }
193
194                // Apply ORDER BY if present
195                if let Some(ref order_by) = stmt.order_by {
196                    self.apply_order_by(&mut results, order_by, params)?;
197                }
198                results.truncate(limit);
199                return Ok(results);
200            }
201        }
202
203        // EPIC-044 US-002: Union mode for similarity() OR metadata
204        if is_union_query {
205            if let Some(ref cond) = stmt.where_clause {
206                let mut results = self.execute_union_query(cond, params, execution_limit)?;
207                if has_graph_predicates {
208                    results = self.apply_where_condition_to_results(
209                        results,
210                        cond,
211                        params,
212                        stmt.from_alias.as_deref(),
213                    )?;
214                }
215
216                // Apply ORDER BY if present
217                if let Some(ref order_by) = stmt.order_by {
218                    self.apply_order_by(&mut results, order_by, params)?;
219                }
220                results.truncate(limit);
221                return Ok(results);
222            }
223        }
224
225        // EPIC-044 US-001: Support multiple similarity() with AND (cascade filtering)
226        let mut results = match (&vector_search, &first_similarity, &filter_condition) {
227            // similarity() function - use first vector to search, then filter by ALL thresholds
228            // Also apply any additional metadata filters from the WHERE clause
229            //
230            // NOTE: This uses ANN (top-K) search, not exhaustive search.
231            // Points outside the top-K window may match the threshold but won't be returned.
232            // We use a 10x over-fetch factor to reduce false negatives.
233            (None, Some((field, vec, op, threshold)), filter_cond) => {
234                // Validate field name - currently only "vector" is supported
235                if field != "vector" {
236                    return Err(crate::error::Error::Config(format!(
237                        "similarity() field '{}' not found. Only 'vector' field is supported. \
238                        Multi-vector support is planned for a future release.",
239                        field
240                    )));
241                }
242
243                // Increase over-fetch factor for multiple similarity conditions
244                let overfetch_factor = 10 * similarity_conditions.len().max(1);
245                let candidates_k = execution_limit
246                    .saturating_mul(overfetch_factor)
247                    .min(MAX_LIMIT);
248                let candidates = self.search(vec, candidates_k)?;
249
250                // EPIC-044 US-001: Apply ALL similarity filters sequentially (cascade)
251                let filter_k = execution_limit.saturating_mul(2);
252                let mut filtered =
253                    self.filter_by_similarity(candidates, field, vec, *op, *threshold, filter_k);
254
255                // Apply remaining similarity conditions (cascade filtering)
256                for (sim_field, sim_vec, sim_op, sim_threshold) in
257                    similarity_conditions.iter().skip(1)
258                {
259                    if sim_field != "vector" {
260                        return Err(crate::error::Error::Config(format!(
261                            "similarity() field '{}' not found. Only 'vector' field is supported.",
262                            sim_field
263                        )));
264                    }
265                    filtered = self.filter_by_similarity(
266                        filtered,
267                        sim_field,
268                        sim_vec,
269                        *sim_op,
270                        *sim_threshold,
271                        filter_k,
272                    );
273                }
274
275                // Then apply any additional metadata filters (e.g., AND category = 'tech')
276                if let Some(cond) = filter_cond {
277                    if skip_metadata_prefilter_for_graph_or {
278                        filtered
279                    } else {
280                        let metadata_filter = Self::extract_metadata_filter(cond);
281                        if let Some(filter_cond) = metadata_filter {
282                            let filter = crate::filter::Filter::new(
283                                crate::filter::Condition::from(filter_cond),
284                            );
285                            filtered
286                                .into_iter()
287                                .filter(|r| match r.point.payload.as_ref() {
288                                    Some(p) => filter.matches(p),
289                                    None => filter.matches(&serde_json::Value::Null),
290                                })
291                                .take(execution_limit)
292                                .collect()
293                        } else {
294                            filtered
295                        }
296                    }
297                } else {
298                    filtered
299                }
300            }
301            // NEAR + similarity() + optional metadata: find candidates, then filter by ALL thresholds
302            // Pattern: "Find top-k neighbors AND keep only those matching ALL similarity conditions"
303            (Some(vector), Some((field, sim_vec, op, threshold)), filter_cond) => {
304                // Validate field name - currently only "vector" is supported
305                if field != "vector" {
306                    return Err(crate::error::Error::Config(format!(
307                        "similarity() field '{}' not found. Only 'vector' field is supported. \
308                        Multi-vector support is planned for a future release.",
309                        field
310                    )));
311                }
312
313                // 1. NEAR finds candidates (overfetch for filtering headroom)
314                let overfetch_factor = 10 * similarity_conditions.len().max(1);
315                let candidates_k = execution_limit
316                    .saturating_mul(overfetch_factor)
317                    .min(MAX_LIMIT);
318                let candidates = self.search(vector, candidates_k)?;
319
320                // 2. EPIC-044 US-001: Apply ALL similarity filters sequentially (cascade)
321                let filter_k = execution_limit.saturating_mul(2);
322                let mut filtered = self
323                    .filter_by_similarity(candidates, field, sim_vec, *op, *threshold, filter_k);
324
325                // Apply remaining similarity conditions
326                for (sim_field, sim_vec, sim_op, sim_threshold) in
327                    similarity_conditions.iter().skip(1)
328                {
329                    if sim_field != "vector" {
330                        return Err(crate::error::Error::Config(format!(
331                            "similarity() field '{}' not found. Only 'vector' field is supported.",
332                            sim_field
333                        )));
334                    }
335                    filtered = self.filter_by_similarity(
336                        filtered,
337                        sim_field,
338                        sim_vec,
339                        *sim_op,
340                        *sim_threshold,
341                        filter_k,
342                    );
343                }
344
345                // 3. Apply additional metadata filters if present
346                if let Some(cond) = filter_cond {
347                    if skip_metadata_prefilter_for_graph_or {
348                        filtered
349                    } else {
350                        let metadata_filter = Self::extract_metadata_filter(cond);
351                        if let Some(filter_cond) = metadata_filter {
352                            let filter = crate::filter::Filter::new(
353                                crate::filter::Condition::from(filter_cond),
354                            );
355                            filtered
356                                .into_iter()
357                                .filter(|r| match r.point.payload.as_ref() {
358                                    Some(p) => filter.matches(p),
359                                    None => filter.matches(&serde_json::Value::Null),
360                                })
361                                .take(execution_limit)
362                                .collect()
363                        } else {
364                            filtered
365                        }
366                    }
367                } else {
368                    filtered
369                }
370            }
371            (Some(vector), None, Some(ref cond)) => {
372                // Check if condition contains MATCH for hybrid search
373                if let Some(text_query) = Self::extract_match_query(cond) {
374                    // Hybrid search: NEAR + MATCH
375                    self.hybrid_search(vector, &text_query, execution_limit, None)?
376                } else {
377                    // Vector search with metadata filter (graph predicates handled separately)
378                    if skip_metadata_prefilter_for_graph_or {
379                        self.search(vector, execution_limit)?
380                    } else if let Some(metadata_cond) = Self::extract_metadata_filter(cond) {
381                        let filter = crate::filter::Filter::new(crate::filter::Condition::from(
382                            metadata_cond,
383                        ));
384                        self.search_with_filter(vector, execution_limit, &filter)?
385                    } else {
386                        self.search(vector, execution_limit)?
387                    }
388                }
389            }
390            (Some(vector), _, None) => {
391                // Pure vector search
392                if let Some(ef) = ef_search {
393                    self.search_with_ef(vector, execution_limit, ef)?
394                } else {
395                    self.search(vector, execution_limit)?
396                }
397            }
398            (None, None, Some(ref cond)) => {
399                // Metadata-only filter (table scan + filter)
400                // If it's a MATCH condition, use text search
401                if let crate::velesql::Condition::Match(ref m) = cond {
402                    // Pure text search - no filter needed
403                    self.text_search(&m.query, execution_limit)
404                } else {
405                    // Generic metadata filter with optional secondary index acceleration.
406                    // If condition only contains graph predicates, scan all then graph-filter.
407                    if skip_metadata_prefilter_for_graph_or {
408                        self.execute_scan_query(
409                            &crate::filter::Filter::new(crate::filter::Condition::And {
410                                conditions: vec![],
411                            }),
412                            execution_limit,
413                        )
414                    } else if let Some(metadata_cond) = Self::extract_metadata_filter(cond) {
415                        if let Some(indexed_results) =
416                            self.execute_indexed_metadata_query(&metadata_cond, execution_limit)
417                        {
418                            indexed_results
419                        } else {
420                            let filter = crate::filter::Filter::new(
421                                crate::filter::Condition::from(metadata_cond),
422                            );
423                            self.execute_scan_query(&filter, execution_limit)
424                        }
425                    } else {
426                        self.execute_scan_query(
427                            &crate::filter::Filter::new(crate::filter::Condition::And {
428                                conditions: vec![],
429                            }),
430                            execution_limit,
431                        )
432                    }
433                }
434            }
435            (None, None, None) => {
436                // SELECT * FROM docs LIMIT N (no WHERE)
437                self.execute_scan_query(
438                    &crate::filter::Filter::new(crate::filter::Condition::And {
439                        conditions: vec![],
440                    }),
441                    execution_limit,
442                )
443            }
444        };
445
446        if has_graph_predicates {
447            if let Some(cond) = stmt.where_clause.as_ref() {
448                results = self.apply_where_condition_to_results(
449                    results,
450                    cond,
451                    params,
452                    stmt.from_alias.as_deref(),
453                )?;
454            }
455        }
456
457        // EPIC-052 US-001: Apply DISTINCT deduplication if requested
458        if stmt.distinct == crate::velesql::DistinctMode::All {
459            results = distinct::apply_distinct(results, &stmt.columns);
460        }
461
462        // Apply ORDER BY if present
463        if let Some(ref order_by) = stmt.order_by {
464            self.apply_order_by(&mut results, order_by, params)?;
465        }
466
467        // Apply limit
468        results.truncate(limit);
469
470        Ok(results)
471    }
472
473    // NOTE: apply_distinct and compute_distinct_key moved to distinct.rs
474    // (EPIC-061/US-003 refactoring)
475
476    // NOTE: filter_by_similarity, execute_not_similarity_query, extract_not_similarity_condition,
477    // execute_scan_query moved to similarity_filter.rs (Plan 04-04)
478
479    // NOTE: execute_union_query, matches_metadata_filter, split_or_condition_with_outer_filter
480    // moved to union_query.rs (Plan 04-04)
481}