velesdb_core/collection/search/query/mod.rs
1//! VelesQL query execution for Collection.
2//!
3//! This module orchestrates query execution by combining:
4//! - Query validation (`validation.rs`)
5//! - Condition extraction (`extraction.rs`)
6//! - ORDER BY processing (`ordering.rs`)
7//!
8//! # Future Enhancement: HybridExecutionPlan Integration
9//!
10//! The `HybridExecutionPlan` and `choose_hybrid_strategy()` in `planner.rs`
11//! are ready for integration to optimize query execution based on:
12//! - Query pattern (ORDER BY similarity, filters, etc.)
13//! - Runtime statistics (latency, selectivity)
14//! - Over-fetch factor for filtered queries
15//!
16//! Future: Integrate `QueryPlanner::choose_hybrid_strategy()` into `execute_query()`
17//! to leverage cost-based optimization for complex queries.
18
19// SAFETY: Numeric casts in query execution are intentional:
20// - f64->f32 for similarity thresholds: precision loss acceptable for filtering
21// - Thresholds are approximate bounds, exact precision not required
22#![allow(clippy::cast_precision_loss)]
23#![allow(clippy::cast_sign_loss)]
24#![allow(clippy::uninlined_format_args)] // Prefer readability in query error paths.
25#![allow(clippy::implicit_hasher)] // HashSet hasher genericity adds noise for internal APIs.
26
27mod aggregation;
28mod distinct;
29mod execution_paths;
30mod extraction;
31#[cfg(test)]
32mod extraction_tests;
33pub mod join;
34#[cfg(test)]
35mod join_tests;
36pub mod match_exec;
37#[cfg(test)]
38mod match_exec_tests;
39pub mod match_metrics;
40#[cfg(test)]
41mod match_metrics_tests;
42pub mod match_planner;
43#[cfg(test)]
44mod match_planner_tests;
45mod ordering;
46pub mod parallel_traversal;
47#[cfg(test)]
48mod parallel_traversal_tests;
49pub mod pushdown;
50#[cfg(test)]
51mod pushdown_tests;
52pub mod score_fusion;
53#[cfg(test)]
54mod score_fusion_tests;
55mod similarity_filter;
56mod union_query;
57mod validation;
58mod where_eval;
59
60// Re-export for potential external use
61#[allow(unused_imports)]
62pub use ordering::compare_json_values;
63// Re-export join functions for future integration with execute_query
64#[allow(unused_imports)]
65pub use join::{execute_join, JoinedResult};
66
67use crate::collection::types::Collection;
68use crate::error::Result;
69use crate::point::SearchResult;
70use std::collections::HashSet;
71
72/// Maximum allowed LIMIT value to prevent overflow in over-fetch calculations.
73const MAX_LIMIT: usize = 100_000;
74
75impl Collection {
76 /// Executes a `VelesQL` query on this collection.
77 ///
78 /// This method unifies vector search, text search, and metadata filtering
79 /// into a single interface.
80 ///
81 /// # Arguments
82 ///
83 /// * `query` - Parsed `VelesQL` query
84 /// * `params` - Query parameters for resolving placeholders (e.g., $v)
85 ///
86 /// # Errors
87 ///
88 /// Returns an error if the query cannot be executed (e.g., missing parameters).
89 #[allow(clippy::too_many_lines)] // Complex dispatch logic - refactoring planned
90 pub fn execute_query(
91 &self,
92 query: &crate::velesql::Query,
93 params: &std::collections::HashMap<String, serde_json::Value>,
94 ) -> Result<Vec<SearchResult>> {
95 crate::velesql::QueryValidator::validate(query)
96 .map_err(|e| crate::error::Error::Query(e.to_string()))?;
97
98 // Unified VelesQL dispatch: allow Collection::execute_query() to run top-level MATCH queries.
99 if let Some(match_clause) = query.match_clause.as_ref() {
100 let mut match_results = self.execute_match(match_clause, params)?;
101
102 if let Some(order_by) = match_clause.return_clause.order_by.as_ref() {
103 for item in order_by.iter().rev() {
104 self.order_match_results(&mut match_results, &item.expression, item.descending);
105 }
106 }
107
108 let mut results = self.match_results_to_search_results(match_results)?;
109 if let Some(limit) = match_clause.return_clause.limit {
110 let limit = usize::try_from(limit).unwrap_or(MAX_LIMIT).min(MAX_LIMIT);
111 results.truncate(limit);
112 }
113 return Ok(results);
114 }
115
116 let stmt = &query.select;
117 // Cap limit to prevent overflow in over-fetch calculations
118 let limit = usize::try_from(stmt.limit.unwrap_or(10))
119 .unwrap_or(MAX_LIMIT)
120 .min(MAX_LIMIT);
121
122 // 1. Extract vector search (NEAR) or similarity() conditions if present
123 let mut vector_search = None;
124 let mut similarity_conditions: Vec<(String, Vec<f32>, crate::velesql::CompareOp, f64)> =
125 Vec::new();
126 let mut filter_condition = None;
127 let mut graph_match_predicates = Vec::new();
128
129 // EPIC-044 US-002: Check for similarity() OR metadata pattern (union mode)
130 let is_union_query = if let Some(ref cond) = stmt.where_clause {
131 Self::has_similarity_in_problematic_or(cond)
132 } else {
133 false
134 };
135
136 // EPIC-044 US-003: Check for NOT similarity() pattern (scan mode)
137 let is_not_similarity_query = if let Some(ref cond) = stmt.where_clause {
138 Self::has_similarity_under_not(cond)
139 } else {
140 false
141 };
142
143 if let Some(ref cond) = stmt.where_clause {
144 // Validate query structure before extraction
145 Self::validate_similarity_query_structure(cond)?;
146 Self::collect_graph_match_predicates(cond, &mut graph_match_predicates);
147
148 let mut extracted_cond = cond.clone();
149 vector_search = self.extract_vector_search(&mut extracted_cond, params)?;
150 // EPIC-044 US-001: Extract ALL similarity conditions for cascade filtering
151 similarity_conditions =
152 self.extract_all_similarity_conditions(&extracted_cond, params)?;
153 filter_condition = Some(extracted_cond);
154
155 // NEAR + similarity() is supported: NEAR finds candidates, similarity() filters by threshold
156 // Multiple similarity() with AND is supported: filters applied sequentially (cascade)
157 }
158
159 // 2. Resolve WITH clause options
160 let mut ef_search = None;
161 if let Some(ref with) = stmt.with_clause {
162 ef_search = with.get_ef_search();
163 }
164
165 // Get first similarity condition for initial search (if any)
166 let first_similarity = similarity_conditions.first().cloned();
167 let has_graph_predicates = !graph_match_predicates.is_empty();
168 let skip_metadata_prefilter_for_graph_or = has_graph_predicates
169 && stmt
170 .where_clause
171 .as_ref()
172 .is_some_and(Self::condition_contains_or);
173 let execution_limit = if has_graph_predicates {
174 MAX_LIMIT
175 } else {
176 limit
177 };
178
179 // 3. Execute query based on extracted components
180 // EPIC-044 US-003: NOT similarity() requires full scan
181 if is_not_similarity_query {
182 if let Some(ref cond) = stmt.where_clause {
183 let mut results =
184 self.execute_not_similarity_query(cond, params, execution_limit)?;
185 if has_graph_predicates {
186 results = self.apply_where_condition_to_results(
187 results,
188 cond,
189 params,
190 stmt.from_alias.as_deref(),
191 )?;
192 }
193
194 // Apply ORDER BY if present
195 if let Some(ref order_by) = stmt.order_by {
196 self.apply_order_by(&mut results, order_by, params)?;
197 }
198 results.truncate(limit);
199 return Ok(results);
200 }
201 }
202
203 // EPIC-044 US-002: Union mode for similarity() OR metadata
204 if is_union_query {
205 if let Some(ref cond) = stmt.where_clause {
206 let mut results = self.execute_union_query(cond, params, execution_limit)?;
207 if has_graph_predicates {
208 results = self.apply_where_condition_to_results(
209 results,
210 cond,
211 params,
212 stmt.from_alias.as_deref(),
213 )?;
214 }
215
216 // Apply ORDER BY if present
217 if let Some(ref order_by) = stmt.order_by {
218 self.apply_order_by(&mut results, order_by, params)?;
219 }
220 results.truncate(limit);
221 return Ok(results);
222 }
223 }
224
225 // EPIC-044 US-001: Support multiple similarity() with AND (cascade filtering)
226 let mut results = match (&vector_search, &first_similarity, &filter_condition) {
227 // similarity() function - use first vector to search, then filter by ALL thresholds
228 // Also apply any additional metadata filters from the WHERE clause
229 //
230 // NOTE: This uses ANN (top-K) search, not exhaustive search.
231 // Points outside the top-K window may match the threshold but won't be returned.
232 // We use a 10x over-fetch factor to reduce false negatives.
233 (None, Some((field, vec, op, threshold)), filter_cond) => {
234 // Validate field name - currently only "vector" is supported
235 if field != "vector" {
236 return Err(crate::error::Error::Config(format!(
237 "similarity() field '{}' not found. Only 'vector' field is supported. \
238 Multi-vector support is planned for a future release.",
239 field
240 )));
241 }
242
243 // Increase over-fetch factor for multiple similarity conditions
244 let overfetch_factor = 10 * similarity_conditions.len().max(1);
245 let candidates_k = execution_limit
246 .saturating_mul(overfetch_factor)
247 .min(MAX_LIMIT);
248 let candidates = self.search(vec, candidates_k)?;
249
250 // EPIC-044 US-001: Apply ALL similarity filters sequentially (cascade)
251 let filter_k = execution_limit.saturating_mul(2);
252 let mut filtered =
253 self.filter_by_similarity(candidates, field, vec, *op, *threshold, filter_k);
254
255 // Apply remaining similarity conditions (cascade filtering)
256 for (sim_field, sim_vec, sim_op, sim_threshold) in
257 similarity_conditions.iter().skip(1)
258 {
259 if sim_field != "vector" {
260 return Err(crate::error::Error::Config(format!(
261 "similarity() field '{}' not found. Only 'vector' field is supported.",
262 sim_field
263 )));
264 }
265 filtered = self.filter_by_similarity(
266 filtered,
267 sim_field,
268 sim_vec,
269 *sim_op,
270 *sim_threshold,
271 filter_k,
272 );
273 }
274
275 // Then apply any additional metadata filters (e.g., AND category = 'tech')
276 if let Some(cond) = filter_cond {
277 if skip_metadata_prefilter_for_graph_or {
278 filtered
279 } else {
280 let metadata_filter = Self::extract_metadata_filter(cond);
281 if let Some(filter_cond) = metadata_filter {
282 let filter = crate::filter::Filter::new(
283 crate::filter::Condition::from(filter_cond),
284 );
285 filtered
286 .into_iter()
287 .filter(|r| match r.point.payload.as_ref() {
288 Some(p) => filter.matches(p),
289 None => filter.matches(&serde_json::Value::Null),
290 })
291 .take(execution_limit)
292 .collect()
293 } else {
294 filtered
295 }
296 }
297 } else {
298 filtered
299 }
300 }
301 // NEAR + similarity() + optional metadata: find candidates, then filter by ALL thresholds
302 // Pattern: "Find top-k neighbors AND keep only those matching ALL similarity conditions"
303 (Some(vector), Some((field, sim_vec, op, threshold)), filter_cond) => {
304 // Validate field name - currently only "vector" is supported
305 if field != "vector" {
306 return Err(crate::error::Error::Config(format!(
307 "similarity() field '{}' not found. Only 'vector' field is supported. \
308 Multi-vector support is planned for a future release.",
309 field
310 )));
311 }
312
313 // 1. NEAR finds candidates (overfetch for filtering headroom)
314 let overfetch_factor = 10 * similarity_conditions.len().max(1);
315 let candidates_k = execution_limit
316 .saturating_mul(overfetch_factor)
317 .min(MAX_LIMIT);
318 let candidates = self.search(vector, candidates_k)?;
319
320 // 2. EPIC-044 US-001: Apply ALL similarity filters sequentially (cascade)
321 let filter_k = execution_limit.saturating_mul(2);
322 let mut filtered = self
323 .filter_by_similarity(candidates, field, sim_vec, *op, *threshold, filter_k);
324
325 // Apply remaining similarity conditions
326 for (sim_field, sim_vec, sim_op, sim_threshold) in
327 similarity_conditions.iter().skip(1)
328 {
329 if sim_field != "vector" {
330 return Err(crate::error::Error::Config(format!(
331 "similarity() field '{}' not found. Only 'vector' field is supported.",
332 sim_field
333 )));
334 }
335 filtered = self.filter_by_similarity(
336 filtered,
337 sim_field,
338 sim_vec,
339 *sim_op,
340 *sim_threshold,
341 filter_k,
342 );
343 }
344
345 // 3. Apply additional metadata filters if present
346 if let Some(cond) = filter_cond {
347 if skip_metadata_prefilter_for_graph_or {
348 filtered
349 } else {
350 let metadata_filter = Self::extract_metadata_filter(cond);
351 if let Some(filter_cond) = metadata_filter {
352 let filter = crate::filter::Filter::new(
353 crate::filter::Condition::from(filter_cond),
354 );
355 filtered
356 .into_iter()
357 .filter(|r| match r.point.payload.as_ref() {
358 Some(p) => filter.matches(p),
359 None => filter.matches(&serde_json::Value::Null),
360 })
361 .take(execution_limit)
362 .collect()
363 } else {
364 filtered
365 }
366 }
367 } else {
368 filtered
369 }
370 }
371 (Some(vector), None, Some(ref cond)) => {
372 // Check if condition contains MATCH for hybrid search
373 if let Some(text_query) = Self::extract_match_query(cond) {
374 // Hybrid search: NEAR + MATCH
375 self.hybrid_search(vector, &text_query, execution_limit, None)?
376 } else {
377 // Vector search with metadata filter (graph predicates handled separately)
378 if skip_metadata_prefilter_for_graph_or {
379 self.search(vector, execution_limit)?
380 } else if let Some(metadata_cond) = Self::extract_metadata_filter(cond) {
381 let filter = crate::filter::Filter::new(crate::filter::Condition::from(
382 metadata_cond,
383 ));
384 self.search_with_filter(vector, execution_limit, &filter)?
385 } else {
386 self.search(vector, execution_limit)?
387 }
388 }
389 }
390 (Some(vector), _, None) => {
391 // Pure vector search
392 if let Some(ef) = ef_search {
393 self.search_with_ef(vector, execution_limit, ef)?
394 } else {
395 self.search(vector, execution_limit)?
396 }
397 }
398 (None, None, Some(ref cond)) => {
399 // Metadata-only filter (table scan + filter)
400 // If it's a MATCH condition, use text search
401 if let crate::velesql::Condition::Match(ref m) = cond {
402 // Pure text search - no filter needed
403 self.text_search(&m.query, execution_limit)
404 } else {
405 // Generic metadata filter with optional secondary index acceleration.
406 // If condition only contains graph predicates, scan all then graph-filter.
407 if skip_metadata_prefilter_for_graph_or {
408 self.execute_scan_query(
409 &crate::filter::Filter::new(crate::filter::Condition::And {
410 conditions: vec![],
411 }),
412 execution_limit,
413 )
414 } else if let Some(metadata_cond) = Self::extract_metadata_filter(cond) {
415 if let Some(indexed_results) =
416 self.execute_indexed_metadata_query(&metadata_cond, execution_limit)
417 {
418 indexed_results
419 } else {
420 let filter = crate::filter::Filter::new(
421 crate::filter::Condition::from(metadata_cond),
422 );
423 self.execute_scan_query(&filter, execution_limit)
424 }
425 } else {
426 self.execute_scan_query(
427 &crate::filter::Filter::new(crate::filter::Condition::And {
428 conditions: vec![],
429 }),
430 execution_limit,
431 )
432 }
433 }
434 }
435 (None, None, None) => {
436 // SELECT * FROM docs LIMIT N (no WHERE)
437 self.execute_scan_query(
438 &crate::filter::Filter::new(crate::filter::Condition::And {
439 conditions: vec![],
440 }),
441 execution_limit,
442 )
443 }
444 };
445
446 if has_graph_predicates {
447 if let Some(cond) = stmt.where_clause.as_ref() {
448 results = self.apply_where_condition_to_results(
449 results,
450 cond,
451 params,
452 stmt.from_alias.as_deref(),
453 )?;
454 }
455 }
456
457 // EPIC-052 US-001: Apply DISTINCT deduplication if requested
458 if stmt.distinct == crate::velesql::DistinctMode::All {
459 results = distinct::apply_distinct(results, &stmt.columns);
460 }
461
462 // Apply ORDER BY if present
463 if let Some(ref order_by) = stmt.order_by {
464 self.apply_order_by(&mut results, order_by, params)?;
465 }
466
467 // Apply limit
468 results.truncate(limit);
469
470 Ok(results)
471 }
472
473 // NOTE: apply_distinct and compute_distinct_key moved to distinct.rs
474 // (EPIC-061/US-003 refactoring)
475
476 // NOTE: filter_by_similarity, execute_not_similarity_query, extract_not_similarity_condition,
477 // execute_scan_query moved to similarity_filter.rs (Plan 04-04)
478
479 // NOTE: execute_union_query, matches_metadata_filter, split_or_condition_with_outer_filter
480 // moved to union_query.rs (Plan 04-04)
481}