Skip to main content

imp_core/tools/
query.rs

1//! Boolean query parser with stemming for code search.
2//!
3//! Supports Elasticsearch-style queries:
4//! - Boolean: "error AND handling", "login OR auth", "database NOT sqlite"
5//! - Phrases: `"user authentication"` (exact match)
6//! - Plain terms: matched with optional stemming
7//!
8//! Default operator between terms is OR.
9
10use std::path::Path;
11
12use regex::Regex;
13
14/// A parsed search query with boolean semantics.
15#[derive(Debug)]
16pub struct Query {
17    /// All must match (AND).
18    pub must: Vec<Matcher>,
19    /// At least one must match (OR / default).
20    pub should: Vec<Matcher>,
21    /// None may match (NOT).
22    pub must_not: Vec<Matcher>,
23}
24
25/// A compiled matcher for a single term or phrase.
26#[derive(Debug)]
27pub struct Matcher {
28    pub regex: Regex,
29    pub original: String,
30}
31
32impl Query {
33    /// Does this query match a given line?
34    pub fn matches_line(&self, line: &str) -> bool {
35        // All must_not must NOT match
36        if self.must_not.iter().any(|m| m.regex.is_match(line)) {
37            return false;
38        }
39
40        // All must terms must match
41        let must_ok = self.must.is_empty() || self.must.iter().all(|m| m.regex.is_match(line));
42
43        // At least one should term must match (if any exist)
44        let should_ok =
45            self.should.is_empty() || self.should.iter().any(|m| m.regex.is_match(line));
46
47        must_ok && should_ok
48    }
49
50    /// Does this query match anywhere in a file's content?
51    /// For AND semantics: each must term appears somewhere in the file.
52    /// For OR semantics: at least one should term appears somewhere.
53    /// For NOT: no must_not term appears anywhere.
54    pub fn matches_file(&self, content: &str) -> bool {
55        if self
56            .must_not
57            .iter()
58            .any(|m| content.lines().any(|l| m.regex.is_match(l)))
59        {
60            return false;
61        }
62
63        let must_ok = self
64            .must
65            .iter()
66            .all(|m| content.lines().any(|l| m.regex.is_match(l)));
67
68        let should_ok = self.should.is_empty()
69            || self
70                .should
71                .iter()
72                .any(|m| content.lines().any(|l| m.regex.is_match(l)));
73
74        must_ok && should_ok
75    }
76
77    /// Find lines in content that match any positive term.
78    /// Used after `matches_file` confirms the file is relevant.
79    pub fn matching_lines(&self, content: &str) -> Vec<usize> {
80        content
81            .lines()
82            .enumerate()
83            .filter(|(_, line)| {
84                let any_positive = self.must.iter().any(|m| m.regex.is_match(line))
85                    || self.should.iter().any(|m| m.regex.is_match(line));
86                let no_negative = !self.must_not.iter().any(|m| m.regex.is_match(line));
87                any_positive && no_negative
88            })
89            .map(|(idx, _)| idx)
90            .collect()
91    }
92
93    /// Is this a simple single-term query? (optimization path)
94    pub fn is_simple(&self) -> bool {
95        self.must.is_empty() && self.must_not.is_empty() && self.should.len() == 1
96    }
97
98    /// Get the single regex for simple queries.
99    pub fn simple_regex(&self) -> Option<&Regex> {
100        if self.is_simple() {
101            Some(&self.should[0].regex)
102        } else {
103            None
104        }
105    }
106}
107
108/// Parse a query string into a structured Query.
109///
110/// Syntax:
111/// - `term` → OR term (default)
112/// - `term1 AND term2` → both must match
113/// - `term1 OR term2` → either must match
114/// - `NOT term` → must not match
115/// - `term1 AND NOT term2` → term1 required, term2 excluded
116/// - `"exact phrase"` → literal phrase match
117///
118/// When `exact` is false, plain terms use stemming (suffix stripping + prefix match).
119/// When `exact` is true, terms use word-boundary matching.
120pub fn parse(input: &str, exact: bool, ignore_case: bool) -> std::result::Result<Query, String> {
121    let tokens = tokenize(input);
122
123    let mut must = Vec::new();
124    let mut should = Vec::new();
125    let mut must_not = Vec::new();
126    let mut has_and = false;
127    let mut next_not = false;
128
129    // First pass: detect if AND is used (changes default grouping)
130    for token in &tokens {
131        if matches!(token, Token::And) {
132            has_and = true;
133            break;
134        }
135    }
136
137    for token in tokens {
138        match token {
139            Token::And => {
140                // AND is implicit in the grouping logic
141            }
142            Token::Or => {
143                // Next term goes to should
144            }
145            Token::Not => {
146                next_not = true;
147            }
148            Token::Term(term) => {
149                let matcher = build_matcher(&term, false, exact, ignore_case)?;
150                if next_not {
151                    must_not.push(matcher);
152                    next_not = false;
153                } else if has_and {
154                    must.push(matcher);
155                } else {
156                    should.push(matcher);
157                }
158            }
159            Token::Phrase(phrase) => {
160                let matcher = build_matcher(&phrase, true, true, ignore_case)?;
161                if next_not {
162                    must_not.push(matcher);
163                    next_not = false;
164                } else if has_and {
165                    must.push(matcher);
166                } else {
167                    should.push(matcher);
168                }
169            }
170        }
171    }
172
173    Ok(Query {
174        must,
175        should,
176        must_not,
177    })
178}
179
180fn build_matcher(
181    term: &str,
182    is_phrase: bool,
183    exact: bool,
184    ignore_case: bool,
185) -> std::result::Result<Matcher, String> {
186    let pattern = if is_phrase || exact {
187        // Exact: escape and use word boundaries
188        format!(r"\b{}\b", regex::escape(term))
189    } else {
190        // Stemmed: strip suffix, match as prefix
191        let stemmed = stem(term);
192        if stemmed.len() < term.len() {
193            // Stem is shorter — match stem followed by optional word chars
194            format!(r"(?i)\b{}\w*", regex::escape(&stemmed))
195        } else {
196            // No stemming applied — just case-insensitive match
197            regex::escape(term)
198        }
199    };
200
201    let regex = regex::RegexBuilder::new(&pattern)
202        .case_insensitive(ignore_case || (!exact && !is_phrase))
203        .build()
204        .map_err(|e| format!("invalid pattern '{term}': {e}"))?;
205
206    Ok(Matcher {
207        regex,
208        original: term.to_string(),
209    })
210}
211
212// ── tokenizer ───────────────────────────────────────────────────────
213
214#[derive(Debug, PartialEq)]
215enum Token {
216    Term(String),
217    Phrase(String),
218    And,
219    Or,
220    Not,
221}
222
223fn tokenize(input: &str) -> Vec<Token> {
224    let mut tokens = Vec::new();
225    let mut chars = input.chars().peekable();
226
227    while let Some(&ch) = chars.peek() {
228        if ch.is_whitespace() {
229            chars.next();
230            continue;
231        }
232
233        if ch == '"' {
234            chars.next(); // consume opening quote
235            let mut phrase = String::new();
236            while let Some(&c) = chars.peek() {
237                if c == '"' {
238                    chars.next(); // consume closing quote
239                    break;
240                }
241                phrase.push(c);
242                chars.next();
243            }
244            if !phrase.is_empty() {
245                tokens.push(Token::Phrase(phrase));
246            }
247            continue;
248        }
249
250        // Collect a word
251        let mut word = String::new();
252        while let Some(&c) = chars.peek() {
253            if c.is_whitespace() || c == '"' {
254                break;
255            }
256            word.push(c);
257            chars.next();
258        }
259
260        match word.as_str() {
261            "AND" => tokens.push(Token::And),
262            "OR" => tokens.push(Token::Or),
263            "NOT" => tokens.push(Token::Not),
264            _ => tokens.push(Token::Term(word)),
265        }
266    }
267
268    tokens
269}
270
271// ── stemming ────────────────────────────────────────────────────────
272
273/// Simple English suffix stripping for code search.
274///
275/// Strips one layer of common suffixes to produce a stem that matches
276/// related word forms. Conservative: requires at least 4 chars remaining
277/// after stripping to avoid over-stemming.
278///
279/// Examples:
280///   authenticate → authenticat (matches authentication, authenticated)
281///   handling → handl (matches handler, handle, handled)
282///   errors → error
283///   connection → connect (matches connected, connecting)
284pub fn stem(word: &str) -> String {
285    let w = word.to_lowercase();
286
287    // Ordered by suffix length (longest first) to strip the most specific suffix
288    let suffixes = &[
289        "ication", // authentication → authent
290        "ation",   // authorization → authoriz
291        "ition",   // definition → defin
292        "ction",   // connection → conne  (try before "tion")
293        "tion",    // completion → comple
294        "sion",    // expression → expres
295        "ment",    // management → manage
296        "ness",    // readiness → readi
297        "able",    // readable → read
298        "ible",    // accessible → access
299        "ally",    // automatically → automatic
300        "ence",    // reference → refer
301        "ance",    // performance → perform
302        "ings",    // settings → sett
303        "ated",    // authenticated → authentic
304        "ized",    // authorized → author
305        "ised",    // recognised → recogn
306        "cting",   // connecting → conne (via "cting" before "ting")
307        "less",    // careless → care
308        "ful",     // successful → success
309        "ous",     // dangerous → danger
310        "ive",     // recursive → recurs
311        "ity",     // security → secur
312        "ing",     // handling → handl
313        "ers",     // handlers → handl
314        "ies",     // queries → quer
315        "ied",     // applied → appl
316        "ion",     // expression → express
317        "ed",      // connected → connect, handled → handl
318        "er",      // handler → handl
319        "ate",     // authenticate → authentic\n        "es",      // matches → match
320        "ly",      // directly → direct
321        "al",      // functional → function
322        "or",      // executor → execut
323        "ar",      // similar → simil
324        "s",       // errors → error
325    ];
326
327    let min_stem = 4;
328
329    for suffix in suffixes {
330        if w.len() > suffix.len() + min_stem && w.ends_with(suffix) {
331            return w[..w.len() - suffix.len()].to_string();
332        }
333    }
334
335    w
336}
337
338// ── language extension mapping ──────────────────────────────────────
339
340/// Map a language name to file extensions for filtering.
341pub fn language_extensions(language: &str) -> Option<&'static [&'static str]> {
342    match language.to_lowercase().as_str() {
343        "rust" | "rs" => Some(&["rs"]),
344        "typescript" | "ts" => Some(&["ts", "tsx"]),
345        "javascript" | "js" => Some(&["js", "jsx", "mjs", "cjs"]),
346        "python" | "py" => Some(&["py", "pyi"]),
347        "go" | "golang" => Some(&["go"]),
348        "java" => Some(&["java"]),
349        "ruby" | "rb" => Some(&["rb"]),
350        "c" => Some(&["c", "h"]),
351        "cpp" | "c++" | "cxx" => Some(&["cpp", "cc", "cxx", "hpp", "hxx", "h"]),
352        "swift" => Some(&["swift"]),
353        "php" => Some(&["php"]),
354        "elixir" | "ex" => Some(&["ex", "exs"]),
355        "zig" => Some(&["zig"]),
356        "lua" => Some(&["lua"]),
357        "shell" | "bash" | "sh" => Some(&["sh", "bash", "zsh"]),
358        "toml" => Some(&["toml"]),
359        "yaml" | "yml" => Some(&["yaml", "yml"]),
360        "json" => Some(&["json"]),
361        "markdown" | "md" => Some(&["md", "markdown"]),
362        _ => None,
363    }
364}
365
366/// Patterns that identify test files.
367pub fn is_test_file(path: &Path) -> bool {
368    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
369    let name_lower = name.to_lowercase();
370
371    // Common test file patterns
372    name_lower.ends_with("_test.go")
373        || name_lower.ends_with("_test.rs")
374        || name_lower.ends_with(".test.ts")
375        || name_lower.ends_with(".test.tsx")
376        || name_lower.ends_with(".test.js")
377        || name_lower.ends_with(".test.jsx")
378        || name_lower.ends_with(".spec.ts")
379        || name_lower.ends_with(".spec.tsx")
380        || name_lower.ends_with(".spec.js")
381        || name_lower.ends_with(".spec.jsx")
382        || name_lower.starts_with("test_")
383        || name_lower == "conftest.py"
384        || path.components().any(|c| {
385            let s = c.as_os_str().to_str().unwrap_or("");
386            s == "tests" || s == "test" || s == "__tests__" || s == "spec" || s == "specs"
387        })
388}
389
390#[cfg(test)]
391mod tests {
392    use super::*;
393
394    #[test]
395    fn tokenize_simple() {
396        let tokens = tokenize("hello world");
397        assert_eq!(
398            tokens,
399            vec![Token::Term("hello".into()), Token::Term("world".into())]
400        );
401    }
402
403    #[test]
404    fn tokenize_boolean() {
405        let tokens = tokenize("error AND handling");
406        assert_eq!(
407            tokens,
408            vec![
409                Token::Term("error".into()),
410                Token::And,
411                Token::Term("handling".into())
412            ]
413        );
414    }
415
416    #[test]
417    fn tokenize_not() {
418        let tokens = tokenize("database NOT sqlite");
419        assert_eq!(
420            tokens,
421            vec![
422                Token::Term("database".into()),
423                Token::Not,
424                Token::Term("sqlite".into())
425            ]
426        );
427    }
428
429    #[test]
430    fn tokenize_phrase() {
431        let tokens = tokenize("\"user authentication\" AND error");
432        assert_eq!(
433            tokens,
434            vec![
435                Token::Phrase("user authentication".into()),
436                Token::And,
437                Token::Term("error".into())
438            ]
439        );
440    }
441
442    #[test]
443    fn tokenize_or() {
444        let tokens = tokenize("login OR auth");
445        assert_eq!(
446            tokens,
447            vec![
448                Token::Term("login".into()),
449                Token::Or,
450                Token::Term("auth".into())
451            ]
452        );
453    }
454
455    #[test]
456    fn parse_and_query() {
457        let q = parse("error AND handling", false, false).unwrap();
458        assert_eq!(q.must.len(), 2);
459        assert_eq!(q.should.len(), 0);
460        assert!(q.matches_line("error handling here"));
461        assert!(!q.matches_line("just an error"));
462    }
463
464    #[test]
465    fn parse_or_query() {
466        let q = parse("login OR auth", false, false).unwrap();
467        assert_eq!(q.should.len(), 2);
468        assert!(q.matches_line("login page"));
469        assert!(q.matches_line("auth token"));
470        assert!(!q.matches_line("nothing here"));
471    }
472
473    #[test]
474    fn parse_not_query() {
475        let q = parse("database NOT sqlite", false, false).unwrap();
476        assert!(q.matches_line("database connection"));
477        assert!(!q.matches_line("database sqlite connection"));
478    }
479
480    #[test]
481    fn parse_phrase_query() {
482        let q = parse("\"user authentication\"", true, false).unwrap();
483        assert!(q.matches_line("the user authentication system"));
484        assert!(!q.matches_line("the user and authentication"));
485    }
486
487    #[test]
488    fn parse_simple_term() {
489        let q = parse("ToolOutput", false, false).unwrap();
490        assert!(q.is_simple());
491        assert!(q.matches_line("fn text() -> ToolOutput {"));
492    }
493
494    #[test]
495    fn stem_common_words() {
496        assert_eq!(stem("authentication"), "authent");
497        assert_eq!(stem("handling"), "handl");
498        assert_eq!(stem("errors"), "error");
499        assert_eq!(stem("handler"), "handl");
500        assert_eq!(stem("performance"), "perform");
501        // "connected": strips "ed" → "connect" (min_stem=4, 9>2+4)
502        assert_eq!(stem("connected"), "connect");
503    }
504
505    #[test]
506    fn stem_short_words_unchanged() {
507        // Words too short to stem safely
508        assert_eq!(stem("run"), "run");
509        assert_eq!(stem("go"), "go");
510        assert_eq!(stem("get"), "get");
511    }
512
513    #[test]
514    fn stemmed_search_matches_variants() {
515        let q = parse("authenticate", false, false).unwrap();
516        // Should match via stemming: "authenticat" prefix
517        assert!(q.matches_line("authentication required"));
518        assert!(q.matches_line("authenticated user"));
519        assert!(q.matches_line("fn authenticate()"));
520    }
521
522    #[test]
523    fn exact_search_no_stemming() {
524        let q = parse("authenticate", true, false).unwrap();
525        assert!(q.matches_line("fn authenticate() {"));
526        // Exact mode: word boundary, so "authentication" should NOT match
527        assert!(!q.matches_line("authentication required"));
528    }
529
530    #[test]
531    fn file_level_and_matching() {
532        let content = "fn handle_error() {\n    log(\"something\");\n}\n\nfn setup_logging() {\n    // configure\n}";
533        let q = parse("error AND logging", false, false).unwrap();
534        assert!(q.matches_file(content));
535
536        let q2 = parse("error AND database", false, false).unwrap();
537        assert!(!q2.matches_file(content));
538    }
539
540    #[test]
541    fn test_file_detection() {
542        assert!(is_test_file(Path::new("src/auth_test.go")));
543        assert!(is_test_file(Path::new("src/auth.test.ts")));
544        assert!(is_test_file(Path::new("src/auth.spec.js")));
545        assert!(is_test_file(Path::new("test_auth.py")));
546        assert!(is_test_file(Path::new("tests/integration.rs")));
547        assert!(is_test_file(Path::new("__tests__/auth.tsx")));
548        assert!(!is_test_file(Path::new("src/auth.rs")));
549        assert!(!is_test_file(Path::new("src/main.ts")));
550    }
551
552    #[test]
553    fn language_extension_mapping() {
554        assert_eq!(language_extensions("rust"), Some(&["rs"][..]));
555        assert_eq!(language_extensions("typescript"), Some(&["ts", "tsx"][..]));
556        assert_eq!(language_extensions("python"), Some(&["py", "pyi"][..]));
557        assert_eq!(language_extensions("unknown"), None);
558    }
559}