reflex-search 1.0.2

A local-first, structure-aware code search engine for AI agents
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
//! Extract literal sequences from regex patterns for trigram optimization
//!
//! This module implements literal extraction from regular expressions to enable
//! fast regex search using the trigram index. The key insight is that many regex
//! patterns contain literal substrings that can narrow down candidate files.
//!
//! # Strategy
//!
//! 1. Extract all literal sequences from the regex pattern (≥3 chars)
//! 2. Generate trigrams from those literals
//! 3. Use trigrams to find files containing ANY literal (UNION approach)
//! 4. Verify actual matches with the regex engine
//!
//! # File Selection: UNION vs INTERSECTION
//!
//! For correctness, we use **UNION** (files with ANY literal):
//! - Alternation `(a|b)` needs files with a OR b → UNION is correct ✓
//! - Sequential `a.*b` needs files with a AND b → UNION includes extra files but still correct ✓
//!
//! Trade-off: UNION may scan 2-3x more files for sequential patterns, but ensures
//! we never miss matches. Performance impact is minimal (<5ms) due to memory-mapped I/O.
//!
//! # Examples
//!
//! - `fn\s+test_.*` → extracts "test_" → searches files containing "test_"
//! - `(class|function)` → extracts ["class", "function"] → searches files with class OR function
//! - `class.*Controller` → extracts ["class", "Controller"] → searches files with class OR Controller
//! - `(?i)test` → case-insensitive → triggers full scan (no literals)
//! - `.*` → no literals → fall back to full scan
//!
//! # References
//!
//! - Russ Cox - Regular Expression Matching with a Trigram Index
//!   https://swtch.com/~rsc/regexp/regexp4.html

use crate::trigram::{extract_trigrams, Trigram};

/// Extract guaranteed trigrams from a regex pattern
///
/// Returns trigrams that MUST appear in any string matching the pattern.
/// These trigrams are used to narrow down candidate files before running
/// the full regex match.
///
/// # Algorithm (MVP - Simple Literal Extraction)
///
/// 1. Split pattern on regex metacharacters: . * + ? | ( ) [ ] { } ^ $ \
/// 2. Keep literal sequences of 3+ characters
/// 3. Extract trigrams from each literal sequence
/// 4. Return all trigrams
///
/// # Examples
///
/// ```
/// use reflex::regex_trigrams::extract_trigrams_from_regex;
///
/// // Simple literal
/// let trigrams = extract_trigrams_from_regex("extract_symbols");
/// assert!(!trigrams.is_empty());
///
/// // Pattern with wildcard
/// let trigrams = extract_trigrams_from_regex("fn.*test");
/// assert!(!trigrams.is_empty()); // Has "fn " and "test"
///
/// // No literals
/// let trigrams = extract_trigrams_from_regex(".*");
/// assert!(trigrams.is_empty()); // Must fall back to full scan
/// ```
pub fn extract_trigrams_from_regex(pattern: &str) -> Vec<Trigram> {
    let literals = extract_literal_sequences(pattern);

    if literals.is_empty() {
        log::debug!("No literals found in regex pattern '{}', will fall back to full scan", pattern);
        return vec![];
    }

    log::debug!("Extracted {} literal sequences from regex: {:?}", literals.len(), literals);

    // Extract trigrams from all literal sequences
    let mut all_trigrams = Vec::new();
    for literal in literals {
        let trigrams = extract_trigrams(&literal);
        all_trigrams.extend(trigrams);
    }

    // Deduplicate trigrams
    all_trigrams.sort_unstable();
    all_trigrams.dedup();

    log::debug!("Extracted {} unique trigrams from regex pattern", all_trigrams.len());
    all_trigrams
}

/// Extract literal sequences (≥3 chars) from a regex pattern
///
/// This is a simple heuristic that splits on regex metacharacters.
/// It doesn't parse the full regex AST but works for common patterns.
///
/// # Regex Metacharacters
///
/// The following characters are treated as non-literal:
/// - Wildcards: `.` `*` `+` `?`
/// - Alternation: `|`
/// - Grouping: `(` `)`
/// - Character classes: `[` `]`
/// - Anchors: `^` `$`
/// - Escapes: `\` (followed by special char)
/// - Quantifiers: `{` `}`
///
/// # Case-Insensitive Patterns
///
/// Patterns with case-insensitive flags like `(?i)` return an empty vector,
/// forcing a full scan. This is because we cannot reliably extract trigrams
/// for case-insensitive matching (would need all case variations).
///
/// # Examples
///
/// ```
/// use reflex::regex_trigrams::extract_literal_sequences;
///
/// assert_eq!(extract_literal_sequences("hello"), vec!["hello"]);
/// assert_eq!(extract_literal_sequences("fn.*test"), vec!["test"]);
/// assert_eq!(extract_literal_sequences("class.*Controller"), vec!["class", "Controller"]);
///
/// // Case-insensitive patterns return empty (triggers full scan)
/// assert_eq!(extract_literal_sequences("(?i)test"), Vec::<String>::new());
/// ```
pub fn extract_literal_sequences(pattern: &str) -> Vec<String> {
    let mut sequences = Vec::new();
    let mut current = String::new();
    let mut chars = pattern.chars().peekable();
    let mut has_case_insensitive_flag = false;

    while let Some(ch) = chars.next() {
        match ch {
            // Regex metacharacters - break the literal sequence
            '.' | '*' | '+' | '?' | '|' | '[' | ']' | '^' | '$' => {
                if current.len() >= 3 {
                    sequences.push(current.clone());
                }
                current.clear();
            }

            // Opening parenthesis - check for inline flags
            '(' => {
                // Save current sequence before clearing
                if current.len() >= 3 {
                    sequences.push(current.clone());
                }
                current.clear();

                // Check if this is an inline flag like (?i), (?m), (?s), (?x), or (?:...)
                if chars.peek() == Some(&'?') {
                    chars.next(); // consume '?'

                    // Peek at the next character to determine the type of group
                    if let Some(&flag_ch) = chars.peek() {
                        match flag_ch {
                            // Non-capturing group (?:...) - only skip the ?: part, not the contents
                            ':' => {
                                chars.next(); // consume ':'
                                // The contents of (?:...) are processed normally
                            }
                            // Inline flags: (?i) (?m) (?s) (?x) (?i-m) etc.
                            // These are standalone modifiers, consume until ')'
                            'i' | 'm' | 's' | 'x' | '-' => {
                                // Check if 'i' flag is present (case-insensitive)
                                if flag_ch == 'i' {
                                    has_case_insensitive_flag = true;
                                }

                                // Consume flag characters and closing ')'
                                while let Some(&next_ch) = chars.peek() {
                                    if next_ch == 'i' {
                                        has_case_insensitive_flag = true;
                                    }
                                    chars.next();
                                    if next_ch == ')' {
                                        break;
                                    }
                                }
                            }
                            // Other special groups (lookahead, lookbehind, etc.) - skip entirely
                            _ => {
                                // Consume until closing ')'
                                while let Some(&next_ch) = chars.peek() {
                                    chars.next();
                                    if next_ch == ')' {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // Closing parenthesis
            ')' => {
                if current.len() >= 3 {
                    sequences.push(current.clone());
                }
                current.clear();
            }

            // Opening brace - quantifier, consume until closing brace
            '{' => {
                if current.len() >= 3 {
                    sequences.push(current.clone());
                }
                current.clear();

                // Consume quantifier contents to avoid treating "2,3" as a literal
                while let Some(&next_ch) = chars.peek() {
                    chars.next();
                    if next_ch == '}' {
                        break;
                    }
                }
            }

            // Closing brace
            '}' => {
                if current.len() >= 3 {
                    sequences.push(current.clone());
                }
                current.clear();
            }

            // Backslash escapes
            '\\' => {
                if let Some(&next_ch) = chars.peek() {
                    match next_ch {
                        // Common escape sequences that represent single characters
                        's' | 'd' | 'w' | 'S' | 'D' | 'W' | 'n' | 't' | 'r' | 'b' | 'B' => {
                            // These are not literal characters, break the sequence
                            chars.next(); // consume the escaped char
                            if current.len() >= 3 {
                                sequences.push(current.clone());
                            }
                            current.clear();
                        }
                        // Escaped metacharacter - treat as literal
                        _ => {
                            chars.next(); // consume the escaped char
                            current.push(next_ch);
                        }
                    }
                } else {
                    // Backslash at end of pattern - ignore
                    if current.len() >= 3 {
                        sequences.push(current.clone());
                    }
                    current.clear();
                }
            }

            // Regular literal character
            _ => {
                current.push(ch);
            }
        }
    }

    // Don't forget the last sequence
    if current.len() >= 3 {
        sequences.push(current);
    }

    // If case-insensitive flag detected, return empty to force full scan
    // Cannot reliably extract trigrams for case-insensitive matching
    if has_case_insensitive_flag {
        log::debug!("Case-insensitive flag detected in pattern, cannot use trigram optimization");
        return vec![];
    }

    sequences
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_literal_sequences_simple() {
        let sequences = extract_literal_sequences("hello");
        assert_eq!(sequences, vec!["hello"]);
    }

    #[test]
    fn test_extract_literal_sequences_with_wildcard() {
        let sequences = extract_literal_sequences("fn.*test");
        assert_eq!(sequences, vec!["test"]);
    }

    #[test]
    fn test_extract_literal_sequences_multiple() {
        let sequences = extract_literal_sequences("class.*Controller");
        assert_eq!(sequences, vec!["class", "Controller"]);
    }

    #[test]
    fn test_extract_literal_sequences_no_literals() {
        let sequences = extract_literal_sequences(".*");
        assert!(sequences.is_empty());
    }

    #[test]
    fn test_extract_literal_sequences_short_literals() {
        // "fn" is only 2 chars, should be skipped
        let sequences = extract_literal_sequences("fn.*test");
        assert_eq!(sequences, vec!["test"]);
    }

    #[test]
    fn test_extract_literal_sequences_escaped() {
        // \. is escaped period, should be literal
        let sequences = extract_literal_sequences("test\\.txt");
        assert_eq!(sequences, vec!["test.txt"]);
    }

    #[test]
    fn test_extract_literal_sequences_whitespace_escape() {
        // \s is whitespace class, not literal
        let sequences = extract_literal_sequences("fn\\s+extract");
        assert_eq!(sequences, vec!["extract"]);
    }

    #[test]
    fn test_extract_literal_sequences_word_boundary() {
        // \b is word boundary, not literal
        let sequences = extract_literal_sequences("\\bListUsersController\\b");
        assert_eq!(sequences, vec!["ListUsersController"]);
    }

    #[test]
    fn test_extract_trigrams_simple_literal() {
        let trigrams = extract_trigrams_from_regex("extract");
        // "extract" has 5 trigrams: "ext", "xtr", "tra", "rac", "act"
        assert_eq!(trigrams.len(), 5);
    }

    #[test]
    fn test_extract_trigrams_with_wildcard() {
        let trigrams = extract_trigrams_from_regex("fn.*test");
        // "test" has 2 trigrams: "tes", "est"
        assert_eq!(trigrams.len(), 2);
    }

    #[test]
    fn test_extract_trigrams_multiple_literals() {
        let trigrams = extract_trigrams_from_regex("class.*Controller");
        // "class" has 3 trigrams, "Controller" has 8
        // Total unique: 11
        assert!(trigrams.len() >= 10); // At least 10 unique trigrams
    }

    #[test]
    fn test_extract_trigrams_no_literals() {
        let trigrams = extract_trigrams_from_regex(".*");
        assert!(trigrams.is_empty());
    }

    #[test]
    fn test_extract_trigrams_complex_pattern() {
        // "(function|const)\s+\w+\s*=" has "function" and "const" as literals
        let trigrams = extract_trigrams_from_regex("(function|const)");
        // "function" has 6 trigrams, "const" has 3
        assert!(trigrams.len() >= 6);
    }

    #[test]
    fn test_extract_literal_sequences_alternation() {
        // Alternation patterns should extract all alternatives as separate literals
        let sequences = extract_literal_sequences("(SymbolWriter|ContentWriter)");
        assert_eq!(sequences, vec!["SymbolWriter", "ContentWriter"]);
    }

    #[test]
    fn test_extract_literal_sequences_three_way_alternation() {
        // Three-way alternation
        let sequences = extract_literal_sequences("(Indexer|QueryEngine|CacheManager)");
        assert_eq!(sequences, vec!["Indexer", "QueryEngine", "CacheManager"]);
    }

    #[test]
    fn test_extract_literal_sequences_case_insensitive_flag() {
        // Case-insensitive flag should trigger full scan (return empty)
        let sequences = extract_literal_sequences("(?i)queryengine");
        assert_eq!(sequences, Vec::<String>::new());
    }

    #[test]
    fn test_extract_literal_sequences_multiline_flag() {
        // Multiline flag should be skipped
        let sequences = extract_literal_sequences("(?m)^test");
        assert_eq!(sequences, vec!["test"]);
    }

    #[test]
    fn test_extract_literal_sequences_non_capturing_group() {
        // Non-capturing group (?:...) should not extract flag chars
        let sequences = extract_literal_sequences("(?:test|func)");
        assert_eq!(sequences, vec!["test", "func"]);
    }

    #[test]
    fn test_extract_literal_sequences_quantifier_no_false_literal() {
        // Quantifier contents should NOT become a literal
        let sequences = extract_literal_sequences("a{2,3}test");
        assert_eq!(sequences, vec!["test"]);

        // Ensure "2,3" is NOT extracted
        assert!(!sequences.contains(&"2,3".to_string()));
    }

    #[test]
    fn test_extract_literal_sequences_quantifier_range() {
        // Test various quantifier formats
        let sequences = extract_literal_sequences("test{1,5}word");
        assert_eq!(sequences, vec!["test", "word"]);
        assert!(!sequences.contains(&"1,5".to_string()));
    }

    #[test]
    fn test_extract_literal_sequences_quantifier_exact() {
        // Exact quantifier {n}
        let sequences = extract_literal_sequences("test{3}word");
        assert_eq!(sequences, vec!["test", "word"]);
        assert!(!sequences.contains(&"3".to_string()));
    }

    #[test]
    fn test_extract_literal_sequences_combined_flags() {
        // Multiple inline flags including 'i' should trigger full scan
        let sequences = extract_literal_sequences("(?im)test");
        assert_eq!(sequences, Vec::<String>::new());
    }

    #[test]
    fn test_extract_literal_sequences_flag_before_literal() {
        // Flag with 'i' at start should trigger full scan
        let sequences = extract_literal_sequences("(?i)test.*function");
        assert_eq!(sequences, Vec::<String>::new());
    }
}