1use std::collections::HashSet;
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use bitflags::bitflags;
10use ignore::WalkBuilder;
11use rayon::prelude::*;
12use regex_automata::{meta::Regex, Input};
13
14use crate::planner::TrigramPlan;
15use crate::verify;
16use crate::Index;
17
18static PARALLEL_MIN_FILES: OnceLock<usize> = OnceLock::new();
19
20#[must_use]
21pub fn parallel_candidate_min_files() -> usize {
22 *PARALLEL_MIN_FILES.get_or_init(|| {
23 let cpus = std::thread::available_parallelism()
24 .map(std::num::NonZeroUsize::get)
25 .unwrap_or(1);
26 let rayon_threads = std::env::var("RAYON_NUM_THREADS")
27 .ok()
28 .and_then(|s| s.parse::<usize>().ok());
29 parallel_scan_min_files_inner(cpus, rayon_threads)
30 })
31}
32
33fn parallel_scan_min_files_inner(cpus: usize, rayon_threads: Option<usize>) -> usize {
34 let effective = rayon_threads
35 .filter(|&n| n > 0)
36 .map_or(cpus, |rt| rt.min(cpus))
37 .max(1);
38 if effective <= 1 {
39 usize::MAX
40 } else {
41 effective
42 }
43}
44
45bitflags! {
46 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
47 pub struct SearchMatchFlags: u8 {
48 const CASE_INSENSITIVE = 1 << 0;
49 const INVERT_MATCH = 1 << 1;
50 const FIXED_STRINGS = 1 << 2;
51 const WORD_REGEXP = 1 << 3;
52 const LINE_REGEXP = 1 << 4;
53 const ONLY_MATCHING = 1 << 5;
54 }
55}
56
57#[derive(Debug, Clone, Copy, Default)]
58pub struct SearchOptions {
59 pub flags: SearchMatchFlags,
60 pub max_results: Option<usize>,
61}
62
63impl SearchOptions {
64 #[must_use]
65 pub const fn case_insensitive(self) -> bool {
66 self.flags.contains(SearchMatchFlags::CASE_INSENSITIVE)
67 }
68
69 #[must_use]
70 pub const fn invert_match(self) -> bool {
71 self.flags.contains(SearchMatchFlags::INVERT_MATCH)
72 }
73
74 #[must_use]
75 pub const fn fixed_strings(self) -> bool {
76 self.flags.contains(SearchMatchFlags::FIXED_STRINGS)
77 }
78
79 #[must_use]
80 pub const fn word_regexp(self) -> bool {
81 self.flags.contains(SearchMatchFlags::WORD_REGEXP)
82 }
83
84 #[must_use]
85 pub const fn line_regexp(self) -> bool {
86 self.flags.contains(SearchMatchFlags::LINE_REGEXP)
87 }
88
89 #[must_use]
90 pub const fn only_matching(self) -> bool {
91 self.flags.contains(SearchMatchFlags::ONLY_MATCHING)
92 }
93
94 #[must_use]
95 pub const fn precludes_trigram_index(self) -> bool {
96 self.case_insensitive() || self.invert_match() || self.word_regexp() || self.line_regexp()
97 }
98}
99
100#[derive(Debug, Clone, PartialEq, Eq)]
101pub struct Match {
102 pub file: PathBuf,
103 pub line: usize,
104 pub text: String,
105}
106
107#[derive(Debug, Clone)]
108pub struct CompiledSearch {
109 re: Regex,
110 opts: SearchOptions,
111 patterns: Vec<String>,
112 plan: TrigramPlan,
113 substring_literals: Option<Vec<Vec<u8>>>,
114}
115
116impl CompiledSearch {
117 pub fn new(patterns: &[String], opts: SearchOptions) -> crate::Result<Self> {
124 if patterns.is_empty() {
125 return Err(crate::Error::EmptyPatterns);
126 }
127 let re = verify::compile_search_pattern(patterns, &opts)?;
128 let plan = TrigramPlan::for_patterns(patterns, &opts);
129 let substring_literals = if opts.fixed_strings()
130 && !opts.case_insensitive()
131 && !opts.word_regexp()
132 && !opts.line_regexp()
133 {
134 Some(patterns.iter().map(|p| p.as_bytes().to_vec()).collect())
135 } else {
136 None
137 };
138 Ok(Self {
139 re,
140 opts,
141 patterns: patterns.to_vec(),
142 plan,
143 substring_literals,
144 })
145 }
146
147 pub fn search_index(&self, index: &Index) -> crate::Result<Vec<Match>> {
153 match &self.plan {
154 TrigramPlan::FullScan => {
155 search_files_impl(self, &index.root, Some(index.files.as_slice()))
156 }
157 TrigramPlan::Narrow { arms } => {
158 let cands = index.candidate_file_ids(arms.as_slice());
159 if cands.is_empty() {
160 return Ok(Vec::new());
161 }
162 let paths: Vec<PathBuf> = cands
163 .iter()
164 .filter_map(|&id| index.files.get(id as usize).cloned())
165 .collect();
166 search_files_impl(self, &index.root, Some(&paths))
167 }
168 }
169 }
170
171 pub fn search_walk(
178 &self,
179 root: &Path,
180 candidates: Option<&[PathBuf]>,
181 ) -> crate::Result<Vec<Match>> {
182 let root = root.canonicalize()?;
183 search_files_impl(self, &root, candidates)
184 }
185
186 #[must_use]
187 pub fn patterns(&self) -> &[String] {
188 &self.patterns
189 }
190}
191
192fn search_files_impl(
193 compiled: &CompiledSearch,
194 root: &Path,
195 candidates: Option<&[PathBuf]>,
196) -> crate::Result<Vec<Match>> {
197 if let Some(set) = candidates {
198 if set.is_empty() {
199 return Ok(Vec::new());
200 }
201 }
202 let mut out = Vec::new();
203 let mut budget = compiled.opts.max_results;
204
205 if let Some(set) = candidates {
206 let sorted = set.len() <= 1 || set.windows(2).all(|w| w[0] <= w[1]);
207 if compiled.opts.max_results.is_none() && set.len() >= parallel_candidate_min_files() {
208 let paths: Vec<PathBuf> = if sorted {
209 set.to_vec()
210 } else {
211 let mut v = set.to_vec();
212 v.sort();
213 v
214 };
215 return Ok(parallel_scan_candidate_files(compiled, root, &paths));
216 }
217 if sorted {
218 'subset: for display in set {
219 if budget == Some(0) {
220 break 'subset;
221 }
222 let path = root.join(display);
223 if !path.is_file() {
224 continue;
225 }
226 if scan_lines(display, &path, compiled, &mut budget, &mut out) {
227 break 'subset;
228 }
229 }
230 } else {
231 let mut paths: Vec<PathBuf> = set.to_vec();
232 paths.sort();
233 'subset: for display in paths {
234 if budget == Some(0) {
235 break 'subset;
236 }
237 let path = root.join(&display);
238 if !path.is_file() {
239 continue;
240 }
241 if scan_lines(&display, &path, compiled, &mut budget, &mut out) {
242 break 'subset;
243 }
244 }
245 }
246 return Ok(out);
247 }
248
249 let walker = WalkBuilder::new(root).follow_links(false).build();
250
251 'files: for entry in walker {
252 let entry = entry.map_err(crate::Error::Ignore)?;
253 if !entry.path().is_file() {
254 continue;
255 }
256 let path = entry.path();
257 let display = path.strip_prefix(root).unwrap_or(path).to_path_buf();
258
259 if scan_lines(&display, path, compiled, &mut budget, &mut out) {
260 break 'files;
261 }
262 }
263
264 Ok(out)
265}
266
267fn parallel_scan_candidate_files(
268 compiled: &CompiledSearch,
269 root: &Path,
270 paths: &[PathBuf],
271) -> Vec<Match> {
272 let chunks: Vec<Vec<Match>> = paths
273 .par_iter()
274 .map(|display| {
275 let path = root.join(display);
276 if !path.is_file() {
277 return Vec::new();
278 }
279 let mut out = Vec::new();
280 let mut budget = None;
281 let _ = scan_lines(display, &path, compiled, &mut budget, &mut out);
282 out
283 })
284 .collect();
285 let mut matches: Vec<Match> = chunks.into_iter().flatten().collect();
286 matches.sort_by(|a, b| {
287 a.file
288 .cmp(&b.file)
289 .then_with(|| a.line.cmp(&b.line))
290 .then_with(|| a.text.cmp(&b.text))
291 });
292 matches
293}
294
295fn bytes_contains_any(line: &[u8], needles: &[Vec<u8>]) -> bool {
296 needles
297 .iter()
298 .any(|n| memchr::memmem::find(line, n).is_some())
299}
300
301fn scan_lines(
302 display: &Path,
303 path: &Path,
304 compiled: &CompiledSearch,
305 budget: &mut Option<usize>,
306 out: &mut Vec<Match>,
307) -> bool {
308 let re = &compiled.re;
309 let opts = compiled.opts;
310 let literals = compiled.substring_literals.as_deref();
311 let Ok(file) = File::open(path) else {
312 return false;
313 };
314 let mut reader = BufReader::new(file);
315 let mut line = Vec::new();
316 let mut line_no = 0usize;
317 let mut cache = re.create_cache();
318
319 loop {
320 if *budget == Some(0) {
321 return true;
322 }
323 line.clear();
324 match reader.read_until(b'\n', &mut line) {
325 Ok(0) | Err(_) => break,
326 Ok(n) => {
327 if n == 0 {
328 break;
329 }
330 }
331 }
332 line_no += 1;
333 while line.len() > 1 && (line[line.len() - 1] == b'\n' || line[line.len() - 1] == b'\r') {
334 line.pop();
335 }
336 if line.len() == 1 && line[0] == b'\n' {
337 line.pop();
338 }
339
340 let matched = match literals {
341 Some(needles) if !opts.only_matching() => bytes_contains_any(&line, needles),
342 Some(needles) if !bytes_contains_any(&line, needles) => false,
343 _ => re.search_with(&mut cache, &Input::new(&line)).is_some(),
344 };
345
346 let take = if opts.invert_match() {
347 !matched
348 } else {
349 matched
350 };
351
352 if !take {
353 continue;
354 }
355
356 if opts.only_matching() && !opts.invert_match() {
357 let mut input = Input::new(&line);
358 while let Some(m) = re.search_with(&mut cache, &input) {
359 if *budget == Some(0) {
360 return true;
361 }
362 let start = m.span().start;
363 let end = m.span().end;
364 let matched_text = String::from_utf8_lossy(&line[start..end]).into_owned();
365 out.push(Match {
366 file: display.to_path_buf(),
367 line: line_no,
368 text: matched_text,
369 });
370 if let Some(b) = *budget {
371 *budget = Some(b - 1);
372 }
373 input = Input::new(&line).range(end..);
374 }
375 } else {
376 out.push(Match {
377 file: display.to_path_buf(),
378 line: line_no,
379 text: String::from_utf8_lossy(&line).into_owned(),
380 });
381 if let Some(b) = *budget {
382 *budget = Some(b - 1);
383 }
384 }
385 }
386 false
387}
388
389pub fn walk_file_paths(root: &Path) -> crate::Result<HashSet<PathBuf>> {
395 let root = root.canonicalize()?;
396 let mut set = HashSet::new();
397 let walker = WalkBuilder::new(&root).follow_links(false).build();
398 for entry in walker {
399 let entry = entry.map_err(crate::Error::Ignore)?;
400 if !entry.path().is_file() {
401 continue;
402 }
403 let path = entry.path();
404 let display = path.strip_prefix(&root).unwrap_or(path).to_path_buf();
405 set.insert(display);
406 }
407 Ok(set)
408}
409
410#[cfg(test)]
411mod tests {
412 use super::*;
413 use std::fs;
414
415 fn tmp_corpus(name: &str) -> PathBuf {
416 std::env::temp_dir().join(format!("sift-search-{name}-{}", std::process::id()))
417 }
418
419 #[test]
420 fn empty_patterns_rejected() {
421 assert!(matches!(
422 CompiledSearch::new(&[], SearchOptions::default()),
423 Err(crate::Error::EmptyPatterns)
424 ));
425 }
426
427 #[test]
428 fn parallel_scan_threshold_rayon_one_disables() {
429 assert_eq!(parallel_scan_min_files_inner(8, Some(1)), usize::MAX);
430 }
431
432 #[test]
433 fn parallel_scan_threshold_caps_at_cpus() {
434 assert_eq!(parallel_scan_min_files_inner(4, Some(16)), 4);
435 }
436
437 #[test]
438 fn parallel_scan_threshold_uses_rayon_when_lower() {
439 assert_eq!(parallel_scan_min_files_inner(8, Some(4)), 4);
440 }
441
442 #[test]
443 fn parallel_scan_threshold_single_cpu_no_parallel() {
444 assert_eq!(parallel_scan_min_files_inner(1, None), usize::MAX);
445 }
446
447 #[test]
448 fn parallel_scan_threshold_zero_rayon_ignored() {
449 assert_eq!(parallel_scan_min_files_inner(8, Some(0)), 8);
450 }
451
452 #[test]
453 fn fixed_string_substring_fast_path_matches_plain_regex() {
454 let dir = tmp_corpus("fixed-fast");
455 let _ = fs::remove_dir_all(&dir);
456 fs::create_dir_all(&dir).unwrap();
457 fs::write(dir.join("a.txt"), "alpha beta\n").unwrap();
458 fs::write(dir.join("b.txt"), "gamma\n").unwrap();
459
460 let pat = vec!["beta".to_string()];
461 let opts_fix = SearchOptions {
462 flags: SearchMatchFlags::FIXED_STRINGS,
463 max_results: None,
464 };
465 let q_fix = CompiledSearch::new(&pat, opts_fix).unwrap();
466 assert!(q_fix.substring_literals.is_some());
467
468 let q_re = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
469 assert!(q_re.substring_literals.is_none());
470
471 assert_eq!(
472 q_fix.search_walk(&dir, None).unwrap(),
473 q_re.search_walk(&dir, None).unwrap()
474 );
475 }
476
477 #[test]
478 fn alternation_finds_both_branches() {
479 let dir = tmp_corpus("regex-or");
480 let _ = fs::remove_dir_all(&dir);
481 fs::create_dir_all(&dir).unwrap();
482 fs::write(dir.join("a.txt"), "x foo y\n").unwrap();
483 fs::write(dir.join("b.txt"), "x bar z\n").unwrap();
484 let pat = vec![r"foo|bar".to_string()];
485 let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
486 let hits = q.search_walk(&dir, None).unwrap();
487 assert_eq!(hits.len(), 2);
488 }
489
490 #[test]
491 fn search_finds_across_two_files() {
492 let dir = tmp_corpus("two-files");
493 let _ = fs::remove_dir_all(&dir);
494 fs::create_dir_all(dir.join("a")).unwrap();
495 fs::create_dir_all(dir.join("b")).unwrap();
496 fs::write(dir.join("a/x.txt"), "one\n").unwrap();
497 fs::write(dir.join("b/y.txt"), "two\n").unwrap();
498 let pat = vec!["o".to_string()];
499 let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
500 let hits = q.search_walk(&dir, None).unwrap();
501 assert_eq!(hits.len(), 2);
502 }
503
504 #[test]
505 fn unsorted_candidates_match_sorted() {
506 let dir = tmp_corpus("cand-order");
507 let _ = fs::remove_dir_all(&dir);
508 fs::create_dir_all(&dir).unwrap();
509 fs::write(dir.join("a.txt"), "hit\n").unwrap();
510 fs::write(dir.join("b.txt"), "hit\n").unwrap();
511 let pat = vec!["hit".to_string()];
512 let opts = SearchOptions::default();
513 let sorted = vec![PathBuf::from("a.txt"), PathBuf::from("b.txt")];
514 let unsorted = vec![PathBuf::from("b.txt"), PathBuf::from("a.txt")];
515 let q = CompiledSearch::new(&pat, opts).unwrap();
516 let a = q.search_walk(&dir, Some(&sorted)).unwrap();
517 let b = q.search_walk(&dir, Some(&unsorted)).unwrap();
518 assert_eq!(a, b);
519 }
520
521 #[test]
522 fn ignore_file_excludes_matches() {
523 let dir = tmp_corpus("ignore");
524 let _ = fs::remove_dir_all(&dir);
525 fs::create_dir_all(&dir).unwrap();
526 fs::write(dir.join(".ignore"), "*.skip\n").unwrap();
527 fs::write(dir.join("keep.txt"), "SECRET=1\n").unwrap();
528 fs::write(dir.join("x.skip"), "SECRET=2\n").unwrap();
529 let pat = vec!["SECRET".to_string()];
530 let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
531 let hits = q.search_walk(&dir, None).unwrap();
532 assert_eq!(hits.len(), 1);
533 assert!(hits[0].file.ends_with("keep.txt"));
534 }
535
536 #[test]
537 fn invert_match_selects_non_matching_lines() {
538 let dir = tmp_corpus("invert");
539 let _ = fs::remove_dir_all(&dir);
540 fs::create_dir_all(&dir).unwrap();
541 fs::write(dir.join("t.txt"), "aa\nbb\n").unwrap();
542 let opts = SearchOptions {
543 flags: SearchMatchFlags::INVERT_MATCH,
544 max_results: None,
545 };
546 let pat = vec!["aa".to_string()];
547 let q = CompiledSearch::new(&pat, opts).unwrap();
548 let hits = q.search_walk(&dir, None).unwrap();
549 assert_eq!(hits.len(), 1);
550 assert_eq!(hits[0].text, "bb");
551 }
552
553 #[test]
554 fn max_results_stops_after_n() {
555 let dir = tmp_corpus("max");
556 let _ = fs::remove_dir_all(&dir);
557 fs::create_dir_all(&dir).unwrap();
558 fs::write(dir.join("t.txt"), "a\na\na\n").unwrap();
559 let opts = SearchOptions {
560 flags: SearchMatchFlags::empty(),
561 max_results: Some(2),
562 };
563 let pat = vec!["a".to_string()];
564 let q = CompiledSearch::new(&pat, opts).unwrap();
565 let hits = q.search_walk(&dir, None).unwrap();
566 assert_eq!(hits.len(), 2);
567 }
568
569 #[test]
570 fn only_matching_emits_spans() {
571 let dir = tmp_corpus("only-o");
572 let _ = fs::remove_dir_all(&dir);
573 fs::create_dir_all(&dir).unwrap();
574 fs::write(dir.join("t.txt"), "foo bar foo\n").unwrap();
575 let opts = SearchOptions {
576 flags: SearchMatchFlags::ONLY_MATCHING,
577 max_results: None,
578 };
579 let pat = vec!["foo".to_string()];
580 let q = CompiledSearch::new(&pat, opts).unwrap();
581 let hits = q.search_walk(&dir, None).unwrap();
582 assert_eq!(hits.len(), 2);
583 assert!(hits.iter().all(|m| m.text == "foo"));
584 }
585
586 #[test]
587 fn walk_file_paths_lists_expected_files() {
588 let dir = tmp_corpus("walk");
589 let _ = fs::remove_dir_all(&dir);
590 fs::create_dir_all(&dir).unwrap();
591 fs::write(dir.join(".ignore"), "x\n").unwrap();
592 fs::write(dir.join("a.rs"), "").unwrap();
593 fs::write(dir.join("x"), "").unwrap();
594 let paths = walk_file_paths(&dir).unwrap();
595 assert!(paths.iter().any(|p| p.ends_with("a.rs")));
596 assert!(!paths
597 .iter()
598 .any(|p| p.as_path() == std::path::Path::new("x")));
599 }
600}