Skip to main content

ix/
planner.rs

1//! Query planner — transforms user input into an optimal index query plan.
2//!
3//! Decomposes regex patterns into required trigram sets.
4//! When a [`RegexPool`] is available, compiled
5//! regexes are sourced from the pool to avoid redundant compilation.
6
7use crate::regex_pool::RegexPool;
8use crate::trigram::{Extractor, Trigram};
9use regex::Regex;
10use regex_syntax::hir::{Hir, HirKind};
11
12/// Optimal query execution strategy chosen by the planner.
13#[derive(Debug)]
14pub enum QueryPlan {
15    /// Fast path: literal string search with pre-computed trigrams.
16    Literal {
17        /// Raw search bytes to match against.
18        pattern: Vec<u8>,
19        /// Set of trigrams extracted from the pattern.
20        trigrams: Vec<Trigram>,
21        /// Pre-compiled regex equivalent to the literal.
22        regex: Regex,
23    },
24
25    /// Regex with extractable literal sub-strings.
26    RegexWithLiterals {
27        /// Pre-compiled user regex.
28        regex: Regex,
29        /// Per-literal required trigram sets (AND across sets).
30        required_trigram_sets: Vec<Vec<Trigram>>,
31    },
32
33    /// Case-insensitive indexed search. Each group = case variants for one
34    /// trigram position. Executor UNIONs within groups, INTERSECTs across.
35    CaseInsensitive {
36        /// Pre-compiled case-insensitive regex.
37        regex: Regex,
38        /// Per-position trigram groups (union within, intersect across).
39        trigram_groups: Vec<Vec<Trigram>>,
40    },
41
42    /// No literals extractable — full scan fallback.
43    FullScan {
44        /// Pre-compiled regex to run against every file.
45        regex: Regex,
46    },
47}
48
49impl QueryPlan {
50    /// Return the regex pattern string for this plan.
51    ///
52    /// Used for cache keying (e.g. negative-result cache fingerprint).
53    #[must_use]
54    pub fn pattern_str(&self) -> &str {
55        match self {
56            Self::Literal { regex, .. }
57            | Self::RegexWithLiterals { regex, .. }
58            | Self::CaseInsensitive { regex, .. }
59            | Self::FullScan { regex } => regex.as_str(),
60        }
61    }
62}
63
64/// Query planning options.
65#[derive(Debug, Default, Clone, Copy)]
66#[allow(clippy::struct_excessive_bools)]
67pub struct QueryOptions {
68    /// Treat pattern as a regex rather than a literal string.
69    pub is_regex: bool,
70    /// Case-insensitive matching.
71    pub ignore_case: bool,
72    /// Enable multiline mode (dot matches newline). Requires `is_regex`.
73    pub multiline: bool,
74    /// Match only at word boundaries (wraps pattern in `\b...\b`).
75    /// Implies `is_regex` internally but enforces whole-word semantics.
76    pub word_boundary: bool,
77}
78
79/// Stateless query planner that decomposes user input into a [`QueryPlan`].
80pub struct Planner;
81
82impl Planner {
83    /// Plan a literal or regex query with default options (non-unicode,
84    /// case-sensitive). See [`plan_with_options`](Self::plan_with_options)
85    /// for full control.
86    #[must_use]
87    pub fn plan(pattern: &str, is_regex: bool) -> QueryPlan {
88        Self::plan_with_options(
89            pattern,
90            QueryOptions {
91                is_regex,
92                ..Default::default()
93            },
94        )
95    }
96
97    /// Plan a query using a regex pool to avoid redundant compilation.
98    ///
99    /// Identical to [`plan_with_options`](Self::plan_with_options) but
100    /// sources compiled regexes from `pool` when available.
101    #[must_use]
102    pub fn plan_with_pool(pattern: &str, options: QueryOptions, pool: &RegexPool) -> QueryPlan {
103        Self::plan_impl(pattern, options, Some(pool))
104    }
105
106    /// Plan a query with full options.
107    ///
108    /// # Panics
109    ///
110    /// Panics if `Regex::new("")` fails (which should never happen since an empty
111    /// pattern is always valid).
112    #[must_use]
113    pub fn plan_with_options(pattern: &str, options: QueryOptions) -> QueryPlan {
114        Self::plan_impl(pattern, options, None)
115    }
116
117    fn compile_regex(pat: &str, pool: Option<&RegexPool>) -> Regex {
118        if let Some(p) = pool
119            && let Ok(re) = p.get_or_compile(pat)
120        {
121            return re;
122        }
123        Regex::new(pat).unwrap_or_else(|_| {
124            pool.map_or_else(
125                || Regex::new("").expect("empty regex should always compile"),
126                |p| p.get_or_compile("").expect("empty regex always compiles"),
127            )
128        })
129    }
130
131    fn plan_impl(pattern: &str, options: QueryOptions, pool: Option<&RegexPool>) -> QueryPlan {
132        let mut final_pattern = pattern.to_string();
133        let mut use_regex = options.is_regex;
134        if options.multiline && use_regex {
135            final_pattern = format!("(?s){final_pattern}");
136        }
137
138        // Word-boundary mode: wrap literal in \b word boundaries.
139        // When word_boundary=true, we use regex mode internally but the semantics
140        // are "whole word match" rather than arbitrary regex.
141        if options.word_boundary && !options.is_regex {
142            if options.ignore_case {
143                final_pattern = format!("(?i)\\b{}\\b", regex::escape(&final_pattern));
144            } else {
145                final_pattern = format!("\\b{}\\b", regex::escape(&final_pattern));
146            }
147            use_regex = true;
148        }
149
150        if !use_regex && !options.ignore_case {
151            let bytes = final_pattern.as_bytes().to_vec();
152            let trigrams = Extractor::extract_set(&bytes);
153
154            let escaped = regex::escape(&final_pattern);
155            let regex = Self::compile_regex(&escaped, pool);
156
157            if trigrams.is_empty() {
158                return QueryPlan::FullScan { regex };
159            }
160
161            return QueryPlan::Literal {
162                pattern: bytes,
163                trigrams,
164                regex,
165            };
166        }
167
168        // Case-insensitive literal: per-position trigram groups.
169        // Executor UNIONs within each group, INTERSECTs across groups.
170        if !use_regex && options.ignore_case {
171            let bytes = final_pattern.as_bytes();
172            let groups = Extractor::extract_groups_case_insensitive(bytes);
173            let regex_pat = format!("(?i){}", regex::escape(&final_pattern));
174            let regex = Self::compile_regex(&regex_pat, pool);
175
176            if groups.is_empty() {
177                return QueryPlan::FullScan { regex };
178            }
179
180            return QueryPlan::CaseInsensitive {
181                regex,
182                trigram_groups: groups,
183            };
184        }
185
186        let regex_pat = if options.ignore_case && !final_pattern.starts_with("(?i)") {
187            format!("(?i){final_pattern}")
188        } else {
189            final_pattern.clone()
190        };
191
192        let regex = Self::compile_regex(&regex_pat, pool);
193
194        let Ok(hir) = regex_syntax::parse(&final_pattern) else {
195            return QueryPlan::FullScan { regex };
196        };
197
198        let mut literals = Vec::new();
199        Self::walk_hir(&hir, &mut literals);
200
201        // For case-insensitive regex, fall back to FullScan — the (?i) regex
202        // handles matching, and extracting trigram groups from regex literals
203        // adds complexity without much narrowing benefit.
204        if options.ignore_case {
205            return QueryPlan::FullScan { regex };
206        }
207
208        let required_trigram_sets: Vec<Vec<Trigram>> = literals
209            .iter()
210            .map(|lit| Extractor::extract_set(lit.as_bytes()))
211            .filter(|t| !t.is_empty())
212            .collect();
213
214        if required_trigram_sets.is_empty() {
215            QueryPlan::FullScan { regex }
216        } else {
217            QueryPlan::RegexWithLiterals {
218                regex,
219                required_trigram_sets,
220            }
221        }
222    }
223
224    fn walk_hir(hir: &Hir, out: &mut Vec<String>) {
225        match hir.kind() {
226            HirKind::Literal(lit) => {
227                out.push(String::from_utf8_lossy(&lit.0).to_string());
228            }
229            HirKind::Concat(children) => {
230                let mut current = String::new();
231                for child in children {
232                    if let HirKind::Literal(lit) = child.kind() {
233                        current.push_str(&String::from_utf8_lossy(&lit.0));
234                    } else {
235                        if current.len() >= 3 {
236                            out.push(current.clone());
237                        }
238                        current.clear();
239                        Self::walk_hir(child, out);
240                    }
241                }
242                if current.len() >= 3 {
243                    out.push(current);
244                }
245            }
246            HirKind::Repetition(rep) if rep.min >= 1 => {
247                Self::walk_hir(&rep.sub, out);
248            }
249            // Simplified: we don't extract from Alternation for now as per DESIGN.md
250            _ => {}
251        }
252    }
253}