Skip to main content

sift_core/
search.rs

1//! Naive full-corpus search: `ignore::WalkBuilder` + byte line scan + `regex_automata::meta::Regex`.
2
3use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use bitflags::bitflags;
10use ignore::WalkBuilder;
11use rayon::prelude::*;
12use regex_automata::{meta::Regex, Input};
13
14use crate::planner::TrigramPlan;
15use crate::verify;
16use crate::Index;
17
18static PARALLEL_MIN_FILES: OnceLock<usize> = OnceLock::new();
19
20#[must_use]
21pub fn parallel_candidate_min_files() -> usize {
22    *PARALLEL_MIN_FILES.get_or_init(|| {
23        let cpus = std::thread::available_parallelism()
24            .map(std::num::NonZeroUsize::get)
25            .unwrap_or(1);
26        let rayon_threads = std::env::var("RAYON_NUM_THREADS")
27            .ok()
28            .and_then(|s| s.parse::<usize>().ok());
29        parallel_scan_min_files_inner(cpus, rayon_threads)
30    })
31}
32
33fn parallel_scan_min_files_inner(cpus: usize, rayon_threads: Option<usize>) -> usize {
34    let effective = rayon_threads
35        .filter(|&n| n > 0)
36        .map_or(cpus, |rt| rt.min(cpus))
37        .max(1);
38    if effective <= 1 {
39        usize::MAX
40    } else {
41        effective
42    }
43}
44
45bitflags! {
46    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
47    pub struct SearchMatchFlags: u8 {
48        const CASE_INSENSITIVE = 1 << 0;
49        const INVERT_MATCH = 1 << 1;
50        const FIXED_STRINGS = 1 << 2;
51        const WORD_REGEXP = 1 << 3;
52        const LINE_REGEXP = 1 << 4;
53        const ONLY_MATCHING = 1 << 5;
54    }
55}
56
57#[derive(Debug, Clone, Copy, Default)]
58pub struct SearchOptions {
59    pub flags: SearchMatchFlags,
60    pub max_results: Option<usize>,
61}
62
63impl SearchOptions {
64    #[must_use]
65    pub const fn case_insensitive(self) -> bool {
66        self.flags.contains(SearchMatchFlags::CASE_INSENSITIVE)
67    }
68
69    #[must_use]
70    pub const fn invert_match(self) -> bool {
71        self.flags.contains(SearchMatchFlags::INVERT_MATCH)
72    }
73
74    #[must_use]
75    pub const fn fixed_strings(self) -> bool {
76        self.flags.contains(SearchMatchFlags::FIXED_STRINGS)
77    }
78
79    #[must_use]
80    pub const fn word_regexp(self) -> bool {
81        self.flags.contains(SearchMatchFlags::WORD_REGEXP)
82    }
83
84    #[must_use]
85    pub const fn line_regexp(self) -> bool {
86        self.flags.contains(SearchMatchFlags::LINE_REGEXP)
87    }
88
89    #[must_use]
90    pub const fn only_matching(self) -> bool {
91        self.flags.contains(SearchMatchFlags::ONLY_MATCHING)
92    }
93
94    #[must_use]
95    pub const fn precludes_trigram_index(self) -> bool {
96        self.case_insensitive() || self.invert_match() || self.word_regexp() || self.line_regexp()
97    }
98}
99
100#[derive(Debug, Clone, PartialEq, Eq)]
101pub struct Match {
102    pub file: PathBuf,
103    pub line: usize,
104    pub text: String,
105}
106
107#[derive(Debug, Clone)]
108pub struct CompiledSearch {
109    re: Regex,
110    opts: SearchOptions,
111    patterns: Vec<String>,
112    plan: TrigramPlan,
113    substring_literals: Option<Vec<Vec<u8>>>,
114}
115
116impl CompiledSearch {
117    /// Compile patterns and options once.
118    ///
119    /// # Errors
120    ///
121    /// Returns [`crate::Error::EmptyPatterns`] if `patterns` is empty, or [`crate::Error::Regex`]
122    /// if compilation fails.
123    pub fn new(patterns: &[String], opts: SearchOptions) -> crate::Result<Self> {
124        if patterns.is_empty() {
125            return Err(crate::Error::EmptyPatterns);
126        }
127        let re = verify::compile_search_pattern(patterns, &opts)?;
128        let plan = TrigramPlan::for_patterns(patterns, &opts);
129        let substring_literals = if opts.fixed_strings()
130            && !opts.case_insensitive()
131            && !opts.word_regexp()
132            && !opts.line_regexp()
133        {
134            Some(patterns.iter().map(|p| p.as_bytes().to_vec()).collect())
135        } else {
136            None
137        };
138        Ok(Self {
139            re,
140            opts,
141            patterns: patterns.to_vec(),
142            plan,
143            substring_literals,
144        })
145    }
146
147    /// Search using an open [`Index`] (trigram narrowing when applicable).
148    ///
149    /// # Errors
150    ///
151    /// Same as [`Self::search_walk`].
152    pub fn search_index(&self, index: &Index) -> crate::Result<Vec<Match>> {
153        match &self.plan {
154            TrigramPlan::FullScan => {
155                search_files_impl(self, &index.root, Some(index.files.as_slice()))
156            }
157            TrigramPlan::Narrow { arms } => {
158                let cands = index.candidate_file_ids(arms.as_slice());
159                if cands.is_empty() {
160                    return Ok(Vec::new());
161                }
162                let paths: Vec<PathBuf> = cands
163                    .iter()
164                    .filter_map(|&id| index.files.get(id as usize).cloned())
165                    .collect();
166                search_files_impl(self, &index.root, Some(&paths))
167            }
168        }
169    }
170
171    /// Walk `root` with ignore rules (or scan only `candidates` when `Some`).
172    ///
173    /// # Errors
174    ///
175    /// Returns [`crate::Error::Io`] if the corpus root cannot be canonicalized, or [`crate::Error::Ignore`]
176    /// for directory walk failures.
177    pub fn search_walk(
178        &self,
179        root: &Path,
180        candidates: Option<&[PathBuf]>,
181    ) -> crate::Result<Vec<Match>> {
182        let root = root.canonicalize()?;
183        search_files_impl(self, &root, candidates)
184    }
185
186    #[must_use]
187    pub fn patterns(&self) -> &[String] {
188        &self.patterns
189    }
190}
191
192fn search_files_impl(
193    compiled: &CompiledSearch,
194    root: &Path,
195    candidates: Option<&[PathBuf]>,
196) -> crate::Result<Vec<Match>> {
197    if let Some(set) = candidates {
198        if set.is_empty() {
199            return Ok(Vec::new());
200        }
201    }
202    let mut out = Vec::new();
203    let mut budget = compiled.opts.max_results;
204
205    if let Some(set) = candidates {
206        let sorted = set.len() <= 1 || set.windows(2).all(|w| w[0] <= w[1]);
207        if compiled.opts.max_results.is_none() && set.len() >= parallel_candidate_min_files() {
208            let paths: Vec<PathBuf> = if sorted {
209                set.to_vec()
210            } else {
211                let mut v = set.to_vec();
212                v.sort();
213                v
214            };
215            return Ok(parallel_scan_candidate_files(compiled, root, &paths));
216        }
217        if sorted {
218            'subset: for display in set {
219                if budget == Some(0) {
220                    break 'subset;
221                }
222                let path = root.join(display);
223                if !path.is_file() {
224                    continue;
225                }
226                if scan_lines(display, &path, compiled, &mut budget, &mut out) {
227                    break 'subset;
228                }
229            }
230        } else {
231            let mut paths: Vec<PathBuf> = set.to_vec();
232            paths.sort();
233            'subset: for display in paths {
234                if budget == Some(0) {
235                    break 'subset;
236                }
237                let path = root.join(&display);
238                if !path.is_file() {
239                    continue;
240                }
241                if scan_lines(&display, &path, compiled, &mut budget, &mut out) {
242                    break 'subset;
243                }
244            }
245        }
246        return Ok(out);
247    }
248
249    let walker = WalkBuilder::new(root).follow_links(false).build();
250
251    'files: for entry in walker {
252        let entry = entry.map_err(crate::Error::Ignore)?;
253        if !entry.path().is_file() {
254            continue;
255        }
256        let path = entry.path();
257        let display = path.strip_prefix(root).unwrap_or(path).to_path_buf();
258
259        if scan_lines(&display, path, compiled, &mut budget, &mut out) {
260            break 'files;
261        }
262    }
263
264    Ok(out)
265}
266
267fn parallel_scan_candidate_files(
268    compiled: &CompiledSearch,
269    root: &Path,
270    paths: &[PathBuf],
271) -> Vec<Match> {
272    let chunks: Vec<Vec<Match>> = paths
273        .par_iter()
274        .map(|display| {
275            let path = root.join(display);
276            if !path.is_file() {
277                return Vec::new();
278            }
279            let mut out = Vec::new();
280            let mut budget = None;
281            let _ = scan_lines(display, &path, compiled, &mut budget, &mut out);
282            out
283        })
284        .collect();
285    let mut matches: Vec<Match> = chunks.into_iter().flatten().collect();
286    matches.sort_by(|a, b| {
287        a.file
288            .cmp(&b.file)
289            .then_with(|| a.line.cmp(&b.line))
290            .then_with(|| a.text.cmp(&b.text))
291    });
292    matches
293}
294
295fn bytes_contains_any(line: &[u8], needles: &[Vec<u8>]) -> bool {
296    needles
297        .iter()
298        .any(|n| memchr::memmem::find(line, n).is_some())
299}
300
301fn scan_lines(
302    display: &Path,
303    path: &Path,
304    compiled: &CompiledSearch,
305    budget: &mut Option<usize>,
306    out: &mut Vec<Match>,
307) -> bool {
308    let re = &compiled.re;
309    let opts = compiled.opts;
310    let literals = compiled.substring_literals.as_deref();
311    let Ok(file) = File::open(path) else {
312        return false;
313    };
314    let mut reader = BufReader::new(file);
315    let mut line = Vec::new();
316    let mut line_no = 0usize;
317    let mut cache = re.create_cache();
318
319    loop {
320        if *budget == Some(0) {
321            return true;
322        }
323        line.clear();
324        match reader.read_until(b'\n', &mut line) {
325            Ok(0) | Err(_) => break,
326            Ok(n) => {
327                if n == 0 {
328                    break;
329                }
330            }
331        }
332        line_no += 1;
333        while line.len() > 1 && (line[line.len() - 1] == b'\n' || line[line.len() - 1] == b'\r') {
334            line.pop();
335        }
336        if line.len() == 1 && line[0] == b'\n' {
337            line.pop();
338        }
339
340        let matched = match literals {
341            Some(needles) if !opts.only_matching() => bytes_contains_any(&line, needles),
342            Some(needles) if !bytes_contains_any(&line, needles) => false,
343            _ => re.search_with(&mut cache, &Input::new(&line)).is_some(),
344        };
345
346        let take = if opts.invert_match() {
347            !matched
348        } else {
349            matched
350        };
351
352        if !take {
353            continue;
354        }
355
356        if opts.only_matching() && !opts.invert_match() {
357            let mut input = Input::new(&line);
358            while let Some(m) = re.search_with(&mut cache, &input) {
359                if *budget == Some(0) {
360                    return true;
361                }
362                let start = m.span().start;
363                let end = m.span().end;
364                let matched_text = String::from_utf8_lossy(&line[start..end]).into_owned();
365                out.push(Match {
366                    file: display.to_path_buf(),
367                    line: line_no,
368                    text: matched_text,
369                });
370                if let Some(b) = *budget {
371                    *budget = Some(b - 1);
372                }
373                input = Input::new(&line).range(end..);
374            }
375        } else {
376            out.push(Match {
377                file: display.to_path_buf(),
378                line: line_no,
379                text: String::from_utf8_lossy(&line).into_owned(),
380            });
381            if let Some(b) = *budget {
382                *budget = Some(b - 1);
383            }
384        }
385    }
386    false
387}
388
389/// All readable file paths under `root` (respecting ignore rules), relative to `root`.
390///
391/// # Errors
392///
393/// Propagates [`crate::Error::Io`] and [`crate::Error::Ignore`] from canonicalization / walking.
394pub fn walk_file_paths(root: &Path) -> crate::Result<HashSet<PathBuf>> {
395    let root = root.canonicalize()?;
396    let mut set = HashSet::new();
397    let walker = WalkBuilder::new(&root).follow_links(false).build();
398    for entry in walker {
399        let entry = entry.map_err(crate::Error::Ignore)?;
400        if !entry.path().is_file() {
401            continue;
402        }
403        let path = entry.path();
404        let display = path.strip_prefix(&root).unwrap_or(path).to_path_buf();
405        set.insert(display);
406    }
407    Ok(set)
408}
409
410#[cfg(test)]
411mod tests {
412    use super::*;
413    use std::fs;
414
415    fn tmp_corpus(name: &str) -> PathBuf {
416        std::env::temp_dir().join(format!("sift-search-{name}-{}", std::process::id()))
417    }
418
419    #[test]
420    fn empty_patterns_rejected() {
421        assert!(matches!(
422            CompiledSearch::new(&[], SearchOptions::default()),
423            Err(crate::Error::EmptyPatterns)
424        ));
425    }
426
427    #[test]
428    fn parallel_scan_threshold_rayon_one_disables() {
429        assert_eq!(parallel_scan_min_files_inner(8, Some(1)), usize::MAX);
430    }
431
432    #[test]
433    fn parallel_scan_threshold_caps_at_cpus() {
434        assert_eq!(parallel_scan_min_files_inner(4, Some(16)), 4);
435    }
436
437    #[test]
438    fn parallel_scan_threshold_uses_rayon_when_lower() {
439        assert_eq!(parallel_scan_min_files_inner(8, Some(4)), 4);
440    }
441
442    #[test]
443    fn parallel_scan_threshold_single_cpu_no_parallel() {
444        assert_eq!(parallel_scan_min_files_inner(1, None), usize::MAX);
445    }
446
447    #[test]
448    fn parallel_scan_threshold_zero_rayon_ignored() {
449        assert_eq!(parallel_scan_min_files_inner(8, Some(0)), 8);
450    }
451
452    #[test]
453    fn fixed_string_substring_fast_path_matches_plain_regex() {
454        let dir = tmp_corpus("fixed-fast");
455        let _ = fs::remove_dir_all(&dir);
456        fs::create_dir_all(&dir).unwrap();
457        fs::write(dir.join("a.txt"), "alpha beta\n").unwrap();
458        fs::write(dir.join("b.txt"), "gamma\n").unwrap();
459
460        let pat = vec!["beta".to_string()];
461        let opts_fix = SearchOptions {
462            flags: SearchMatchFlags::FIXED_STRINGS,
463            max_results: None,
464        };
465        let q_fix = CompiledSearch::new(&pat, opts_fix).unwrap();
466        assert!(q_fix.substring_literals.is_some());
467
468        let q_re = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
469        assert!(q_re.substring_literals.is_none());
470
471        assert_eq!(
472            q_fix.search_walk(&dir, None).unwrap(),
473            q_re.search_walk(&dir, None).unwrap()
474        );
475    }
476
477    #[test]
478    fn alternation_finds_both_branches() {
479        let dir = tmp_corpus("regex-or");
480        let _ = fs::remove_dir_all(&dir);
481        fs::create_dir_all(&dir).unwrap();
482        fs::write(dir.join("a.txt"), "x foo y\n").unwrap();
483        fs::write(dir.join("b.txt"), "x bar z\n").unwrap();
484        let pat = vec![r"foo|bar".to_string()];
485        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
486        let hits = q.search_walk(&dir, None).unwrap();
487        assert_eq!(hits.len(), 2);
488    }
489
490    #[test]
491    fn search_finds_across_two_files() {
492        let dir = tmp_corpus("two-files");
493        let _ = fs::remove_dir_all(&dir);
494        fs::create_dir_all(dir.join("a")).unwrap();
495        fs::create_dir_all(dir.join("b")).unwrap();
496        fs::write(dir.join("a/x.txt"), "one\n").unwrap();
497        fs::write(dir.join("b/y.txt"), "two\n").unwrap();
498        let pat = vec!["o".to_string()];
499        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
500        let hits = q.search_walk(&dir, None).unwrap();
501        assert_eq!(hits.len(), 2);
502    }
503
504    #[test]
505    fn unsorted_candidates_match_sorted() {
506        let dir = tmp_corpus("cand-order");
507        let _ = fs::remove_dir_all(&dir);
508        fs::create_dir_all(&dir).unwrap();
509        fs::write(dir.join("a.txt"), "hit\n").unwrap();
510        fs::write(dir.join("b.txt"), "hit\n").unwrap();
511        let pat = vec!["hit".to_string()];
512        let opts = SearchOptions::default();
513        let sorted = vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")];
514        let unsorted = vec![PathBuf::from("b.txt"), PathBuf::from("a.txt")];
515        let q = CompiledSearch::new(&pat, opts).unwrap();
516        let a = q.search_walk(&dir, Some(&sorted)).unwrap();
517        let b = q.search_walk(&dir, Some(&unsorted)).unwrap();
518        assert_eq!(a, b);
519    }
520
521    #[test]
522    fn ignore_file_excludes_matches() {
523        let dir = tmp_corpus("ignore");
524        let _ = fs::remove_dir_all(&dir);
525        fs::create_dir_all(&dir).unwrap();
526        fs::write(dir.join(".ignore"), "*.skip\n").unwrap();
527        fs::write(dir.join("keep.txt"), "SECRET=1\n").unwrap();
528        fs::write(dir.join("x.skip"), "SECRET=2\n").unwrap();
529        let pat = vec!["SECRET".to_string()];
530        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
531        let hits = q.search_walk(&dir, None).unwrap();
532        assert_eq!(hits.len(), 1);
533        assert!(hits[0].file.ends_with("keep.txt"));
534    }
535
536    #[test]
537    fn invert_match_selects_non_matching_lines() {
538        let dir = tmp_corpus("invert");
539        let _ = fs::remove_dir_all(&dir);
540        fs::create_dir_all(&dir).unwrap();
541        fs::write(dir.join("t.txt"), "aa\nbb\n").unwrap();
542        let opts = SearchOptions {
543            flags: SearchMatchFlags::INVERT_MATCH,
544            max_results: None,
545        };
546        let pat = vec!["aa".to_string()];
547        let q = CompiledSearch::new(&pat, opts).unwrap();
548        let hits = q.search_walk(&dir, None).unwrap();
549        assert_eq!(hits.len(), 1);
550        assert_eq!(hits[0].text, "bb");
551    }
552
553    #[test]
554    fn max_results_stops_after_n() {
555        let dir = tmp_corpus("max");
556        let _ = fs::remove_dir_all(&dir);
557        fs::create_dir_all(&dir).unwrap();
558        fs::write(dir.join("t.txt"), "a\na\na\n").unwrap();
559        let opts = SearchOptions {
560            flags: SearchMatchFlags::empty(),
561            max_results: Some(2),
562        };
563        let pat = vec!["a".to_string()];
564        let q = CompiledSearch::new(&pat, opts).unwrap();
565        let hits = q.search_walk(&dir, None).unwrap();
566        assert_eq!(hits.len(), 2);
567    }
568
569    #[test]
570    fn only_matching_emits_spans() {
571        let dir = tmp_corpus("only-o");
572        let _ = fs::remove_dir_all(&dir);
573        fs::create_dir_all(&dir).unwrap();
574        fs::write(dir.join("t.txt"), "foo bar foo\n").unwrap();
575        let opts = SearchOptions {
576            flags: SearchMatchFlags::ONLY_MATCHING,
577            max_results: None,
578        };
579        let pat = vec!["foo".to_string()];
580        let q = CompiledSearch::new(&pat, opts).unwrap();
581        let hits = q.search_walk(&dir, None).unwrap();
582        assert_eq!(hits.len(), 2);
583        assert!(hits.iter().all(|m| m.text == "foo"));
584    }
585
586    #[test]
587    fn walk_file_paths_lists_expected_files() {
588        let dir = tmp_corpus("walk");
589        let _ = fs::remove_dir_all(&dir);
590        fs::create_dir_all(&dir).unwrap();
591        fs::write(dir.join(".ignore"), "x\n").unwrap();
592        fs::write(dir.join("a.rs"), "").unwrap();
593        fs::write(dir.join("x"), "").unwrap();
594        let paths = walk_file_paths(&dir).unwrap();
595        assert!(paths.iter().any(|p| p.ends_with("a.rs")));
596        assert!(!paths
597            .iter()
598            .any(|p| p.as_path() == std::path::Path::new("x")));
599    }
600}