1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
//! Query planner — transforms user input into an optimal index query plan.
//!
//! Decomposes regex patterns into required trigram sets.
use crate::trigram::{Extractor, Trigram};
use regex::Regex;
use regex_syntax::hir::{Hir, HirKind};
#[derive(Debug)]
pub enum QueryPlan {
/// Fast path: literal string search
Literal {
pattern: Vec<u8>,
trigrams: Vec<Trigram>,
},
/// Regex with extractable literals
RegexWithLiterals {
regex: Regex,
required_trigram_sets: Vec<Vec<Trigram>>,
},
/// Case-insensitive indexed search. Each group = case variants for one
/// trigram position. Executor UNIONs within groups, INTERSECTs across.
CaseInsensitive {
regex: Regex,
trigram_groups: Vec<Vec<Trigram>>,
},
/// No literals extractable — full scan fallback
FullScan { regex: Regex },
}
pub struct Planner;
impl Planner {
pub fn plan(pattern: &str, is_regex: bool) -> QueryPlan {
Self::plan_with_options(pattern, is_regex, false, false)
}
pub fn plan_with_options(
pattern: &str,
is_regex: bool,
ignore_case: bool,
multiline: bool,
) -> QueryPlan {
let mut final_pattern = pattern.to_string();
if multiline && is_regex {
final_pattern = format!("(?s){}", final_pattern);
}
if !is_regex && !ignore_case {
let bytes = final_pattern.as_bytes().to_vec();
let trigrams = Extractor::extract_set(&bytes);
if trigrams.is_empty() {
// Pattern too short for trigrams (< 3 bytes)
let regex = Regex::new(®ex::escape(&final_pattern))
.unwrap_or_else(|_| Regex::new("").unwrap());
return QueryPlan::FullScan { regex };
}
return QueryPlan::Literal {
pattern: bytes,
trigrams,
};
}
// Case-insensitive literal: per-position trigram groups.
// Executor UNIONs within each group, INTERSECTs across groups.
if !is_regex && ignore_case {
let bytes = final_pattern.as_bytes();
let groups = Extractor::extract_groups_case_insensitive(bytes);
let regex_pat = format!("(?i){}", regex::escape(&final_pattern));
let regex = match Regex::new(®ex_pat) {
Ok(r) => r,
Err(_) => {
return QueryPlan::FullScan {
regex: Regex::new("").unwrap(),
};
}
};
if groups.is_empty() {
return QueryPlan::FullScan { regex };
}
return QueryPlan::CaseInsensitive {
regex,
trigram_groups: groups,
};
}
let regex_pat = if ignore_case && !final_pattern.starts_with("(?i)") {
format!("(?i){final_pattern}")
} else {
final_pattern.clone()
};
let regex = match Regex::new(®ex_pat) {
Ok(r) => r,
Err(_) => {
return QueryPlan::FullScan {
regex: Regex::new("").unwrap(),
};
} // Should be handled by CLI
};
let hir = match regex_syntax::parse(&final_pattern) {
Ok(h) => h,
Err(_) => return QueryPlan::FullScan { regex },
};
let mut literals = Vec::new();
Self::walk_hir(&hir, &mut literals);
// For case-insensitive regex, fall back to FullScan — the (?i) regex
// handles matching, and extracting trigram groups from regex literals
// adds complexity without much narrowing benefit.
if ignore_case {
return QueryPlan::FullScan { regex };
}
let required_trigram_sets: Vec<Vec<Trigram>> = literals
.iter()
.map(|lit| Extractor::extract_set(lit.as_bytes()))
.filter(|t| !t.is_empty())
.collect();
if required_trigram_sets.is_empty() {
QueryPlan::FullScan { regex }
} else {
QueryPlan::RegexWithLiterals {
regex,
required_trigram_sets,
}
}
}
fn walk_hir(hir: &Hir, out: &mut Vec<String>) {
match hir.kind() {
HirKind::Literal(lit) => {
out.push(String::from_utf8_lossy(&lit.0).to_string());
}
HirKind::Concat(children) => {
let mut current = String::new();
for child in children {
match child.kind() {
HirKind::Literal(lit) => {
current.push_str(&String::from_utf8_lossy(&lit.0));
}
_ => {
if current.len() >= 3 {
out.push(current.clone());
}
current.clear();
Self::walk_hir(child, out);
}
}
}
if current.len() >= 3 {
out.push(current);
}
}
HirKind::Repetition(rep) => {
if rep.min >= 1 {
Self::walk_hir(&rep.sub, out);
}
}
// Simplified: we don't extract from Alternation for now as per DESIGN.md
_ => {}
}
}
}