1use crate::{
9 BigramFilter, BigramOverlay,
10 bigram_query::{fuzzy_to_bigram_query, regex_to_bigram_query},
11 constraints::apply_constraints,
12 extract_bigrams,
13 sort_buffer::sort_with_buffer,
14 types::{ContentCacheBudget, FileItem},
15};
16use aho_corasick::AhoCorasick;
17pub use fff_grep::{
18 Searcher, SearcherBuilder, Sink, SinkMatch,
19 lines::{self, LineStep},
20 matcher::{Match, Matcher, NoError},
21};
22use fff_query_parser::{Constraint, FFFQuery, GrepConfig, QueryParser};
23use rayon::prelude::*;
24use smallvec::SmallVec;
25use std::path::Path;
26use std::sync::Arc;
27use std::sync::atomic::{AtomicBool, Ordering};
28use tracing::Level;
29
30pub fn is_definition_line(line: &str) -> bool {
39 let s = line.trim_start().as_bytes();
40 let s = skip_modifiers(s);
41 is_definition_keyword(s)
42}
43
/// Modifier keywords that may precede a definition keyword
/// (e.g. `pub async fn`, `export default class`); `skip_modifiers`
/// strips these before the definition-keyword check runs.
const MODIFIERS: &[&[u8]] = &[
    b"pub",
    b"export",
    b"default",
    b"async",
    b"abstract",
    b"unsafe",
    b"static",
    b"protected",
    b"private",
    b"public",
];
58
/// Keywords that introduce a definition across the supported languages
/// (Rust, JS/TS, Python, Go, Java/C#, Ruby/Scala, …).
const DEF_KEYWORDS: &[&[u8]] = &[
    b"struct",
    b"fn",
    b"enum",
    b"trait",
    b"impl",
    b"class",
    b"interface",
    b"function",
    b"def",
    b"func",
    b"type",
    b"module",
    b"object",
];
75
76fn skip_modifiers(mut s: &[u8]) -> &[u8] {
78 loop {
79 if s.starts_with(b"pub(")
81 && let Some(end) = s.iter().position(|&b| b == b')')
82 {
83 s = skip_ws(&s[end + 1..]);
84 continue;
85 }
86 let mut matched = false;
87 for &kw in MODIFIERS {
88 if s.starts_with(kw) {
89 let rest = &s[kw.len()..];
90 if rest.first().is_some_and(|b| b.is_ascii_whitespace()) {
91 s = skip_ws(rest);
92 matched = true;
93 break;
94 }
95 }
96 }
97 if !matched {
98 return s;
99 }
100 }
101}
102
103fn is_definition_keyword(s: &[u8]) -> bool {
105 for &kw in DEF_KEYWORDS {
106 if s.starts_with(kw) {
107 let after = s.get(kw.len());
108 if after.is_none_or(|b| !b.is_ascii_alphanumeric() && *b != b'_') {
110 return true;
111 }
112 }
113 }
114 false
115}
116
/// Returns the suffix of `s` starting at the first non-ASCII-whitespace byte
/// (empty when `s` is all whitespace).
#[inline]
fn skip_ws(s: &[u8]) -> &[u8] {
    match s.iter().position(|b| !b.is_ascii_whitespace()) {
        Some(first_non_ws) => &s[first_non_ws..],
        None => &s[s.len()..],
    }
}
126
127pub fn is_import_line(line: &str) -> bool {
132 let s = line.trim_start().as_bytes();
133 s.starts_with(b"import ")
134 || s.starts_with(b"import\t")
135 || (s.starts_with(b"from ") && s.get(5).is_some_and(|&b| b == b'\'' || b == b'"'))
136 || s.starts_with(b"use ")
137 || s.starts_with(b"use\t")
138 || starts_with_require(s)
139 || starts_with_include(s)
140}
141
/// True for `require(` or `require (` — a Node/CommonJS import call.
#[inline]
fn starts_with_require(s: &[u8]) -> bool {
    match s.strip_prefix(b"require") {
        Some(rest) => matches!(rest, [b'(', ..] | [b' ', b'(', ..]),
        None => false,
    }
}
151
152#[inline]
154fn starts_with_include(s: &[u8]) -> bool {
155 if s.first() != Some(&b'#') {
156 return false;
157 }
158 let rest = skip_ws(&s[1..]);
159 rest.starts_with(b"include ") || rest.starts_with(b"include\t")
160}
161
/// Returns true when `text` contains characters that are special in a regex,
/// detected by round-tripping through `regex::escape`: escaping changes the
/// string iff a metacharacter is present.
pub fn has_regex_metacharacters(text: &str) -> bool {
    regex::escape(text) != text
}
175
/// Reports whether `text` contains a live `\n` escape sequence — a `\`
/// immediately followed by `n` where the backslash itself is not escaped
/// (i.e. it terminates an odd-length run of backslashes).
#[inline]
fn has_unescaped_newline_escape(text: &str) -> bool {
    let bytes = text.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }
        if bytes[i + 1] == b'n' {
            // Count the contiguous run of backslashes ending at `i` (inclusive).
            let mut run = 1;
            while run <= i && bytes[i - run] == b'\\' {
                run += 1;
            }
            // Odd run length => the final backslash is unescaped, so `\n` is live.
            if run % 2 == 1 {
                return true;
            }
        }
        // A backslash consumes the byte after it, live escape or not.
        i += 2;
    }
    false
}
205
/// Replaces every live `\n` escape sequence in `text` with a real newline,
/// leaving escaped sequences (`\\n`, i.e. literal backslash then `n`) intact.
/// Falls back to the original text if the byte-level rewrite somehow produces
/// invalid UTF-8.
fn replace_unescaped_newline_escapes(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        let live_escape = bytes[i] == b'\\'
            && bytes.get(i + 1) == Some(&b'n')
            && {
                // Odd-length backslash run ending here => the escape is live.
                let mut run = 1;
                while run <= i && bytes[i - run] == b'\\' {
                    run += 1;
                }
                run % 2 == 1
            };
        if live_escape {
            out.push(b'\n');
            i += 2;
        } else {
            out.push(bytes[i]);
            i += 1;
        }
    }
    String::from_utf8(out).unwrap_or_else(|_| text.to_string())
}
236
/// How a grep pattern is interpreted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum GrepMode {
    /// Literal substring search (the default).
    #[default]
    PlainText,
    /// Regular-expression search.
    Regex,
    /// Fuzzy (typo-tolerant) search.
    Fuzzy,
}
255
/// A single matched line, plus the metadata needed to display and locate it.
#[derive(Debug, Clone)]
pub struct GrepMatch {
    /// Index into `GrepResult::files` identifying the file this match is from.
    pub file_index: usize,
    /// Line number reported by the searcher (0 when unavailable).
    pub line_number: u64,
    /// Byte column of the first hit within `line_content`.
    pub col: usize,
    /// Absolute byte offset of the start of the matched line within the file.
    pub byte_offset: u64,
    /// The matched line, trailing newline stripped and truncated for display.
    pub line_content: String,
    /// `(start, end)` byte ranges of every hit within `line_content`.
    pub match_byte_offsets: SmallVec<[(u32, u32); 4]>,
    /// Fuzzy-match score; `None` for plain-text and regex matches.
    pub fuzzy_score: Option<u16>,
    /// True when the line looks like a definition (when classification is on).
    pub is_definition: bool,
    /// Context lines immediately preceding the match (oldest first).
    pub context_before: Vec<String>,
    /// Context lines immediately following the match.
    pub context_after: Vec<String>,
}
283
284impl GrepMatch {
285 pub fn trim_leading_whitespace(&mut self) {
288 let strip_len = self.line_content.len() - self.line_content.trim_start().len();
289 if strip_len > 0 {
290 self.line_content.drain(..strip_len);
291 let off = strip_len as u32;
292 self.col = self.col.saturating_sub(strip_len);
293 for range in &mut self.match_byte_offsets {
294 range.0 = range.0.saturating_sub(off);
295 range.1 = range.1.saturating_sub(off);
296 }
297 }
298 for line in &mut self.context_before {
299 let n = line.len() - line.trim_start().len();
300 if n > 0 {
301 line.drain(..n);
302 }
303 }
304 for line in &mut self.context_after {
305 let n = line.len() - line.trim_start().len();
306 if n > 0 {
307 line.drain(..n);
308 }
309 }
310 }
311}
312
/// Outcome of a grep over a set of files, including pagination bookkeeping.
#[derive(Debug, Clone, Default)]
pub struct GrepResult<'a> {
    /// All collected matches; `GrepMatch::file_index` indexes into `files`.
    pub matches: Vec<GrepMatch>,
    /// Files that produced at least one match.
    pub files: Vec<&'a FileItem>,
    /// Number of candidate files actually consumed for this page.
    pub total_files_searched: usize,
    /// Total number of files known to the index.
    pub total_files: usize,
    /// Number of files remaining after constraint/size prefiltering.
    pub filtered_file_count: usize,
    /// Count of files with at least one match (equals `files.len()`).
    pub files_with_matches: usize,
    /// Offset to resume from on the next page; 0 when the search is complete.
    pub next_file_offset: usize,
    /// Error message from a failed regex build when a fallback was used.
    /// NOTE(review): always `None` in the code paths visible here — confirm
    /// where it is populated.
    pub regex_fallback_error: Option<String>,
}
335
/// Tunables for a grep search; see the `Default` impl for baseline values.
#[derive(Debug, Clone)]
pub struct GrepSearchOptions {
    /// Files larger than this many bytes are skipped.
    pub max_file_size: u64,
    /// Per-file cap on collected matches (0 = unlimited).
    pub max_matches_per_file: usize,
    /// When true, patterns with no uppercase letters search case-insensitively.
    pub smart_case: bool,
    /// Number of candidate files to skip (pagination resume point).
    pub file_offset: usize,
    /// Stop collecting once this many matches are gathered for the page.
    pub page_limit: usize,
    /// Pattern interpretation mode.
    pub mode: GrepMode,
    /// Soft time budget in milliseconds (0 = unlimited).
    pub time_budget_ms: u64,
    /// Context lines to capture before each match.
    pub before_context: usize,
    /// Context lines to capture after each match.
    pub after_context: usize,
    /// When true, flag matches on definition-like lines.
    pub classify_definitions: bool,
    /// When true, strip leading whitespace from stored lines (offsets adjusted).
    pub trim_whitespace: bool,
    /// Optional external cancellation flag checked during the search.
    pub abort_signal: Option<Arc<AtomicBool>>,
}
370
371impl Default for GrepSearchOptions {
372 fn default() -> Self {
373 Self {
374 max_file_size: 10 * 1024 * 1024,
375 max_matches_per_file: 200,
376 smart_case: true,
377 file_offset: 0,
378 page_limit: 50,
379 mode: GrepMode::default(),
380 time_budget_ms: 0,
381 before_context: 0,
382 after_context: 0,
383 classify_definitions: false,
384 trim_whitespace: false,
385 abort_signal: None,
386 }
387 }
388}
389
/// Read-only state shared by every per-file task during one grep run.
#[derive(Clone, Copy)]
struct GrepContext<'a, 'b> {
    /// Total files known to the index (reported back unchanged).
    total_files: usize,
    /// Files remaining after prefiltering (reported back unchanged).
    filtered_file_count: usize,
    /// Budget passed through to on-demand content loading.
    budget: &'a ContentCacheBudget,
    /// Root used when resolving file content paths.
    base_path: &'a Path,
    /// Arena holding data for regular files.
    arena: crate::simd_path::ArenaPtr,
    /// Arena holding data for overflow files (see `arena_for_file`).
    overflow_arena: crate::simd_path::ArenaPtr,
    /// Optional literal prefilter run on file content before the full matcher.
    prefilter: Option<&'a memchr::memmem::Finder<'b>>,
    /// Whether the prefilter scan must be case-insensitive.
    prefilter_case_insensitive: bool,
    /// Cooperative cancellation flag.
    abort_signal: &'a AtomicBool,
}
402
403impl GrepContext<'_, '_> {
404 #[inline]
405 fn arena_for_file(&self, file: &FileItem) -> crate::simd_path::ArenaPtr {
406 if file.is_overflow() {
407 self.overflow_arena
408 } else {
409 self.arena
410 }
411 }
412}
413
/// Adapts a compiled `regex::bytes::Regex` to the `fff_grep::Matcher` trait.
struct RegexMatcher<'r> {
    /// The compiled pattern.
    regex: &'r regex::bytes::Regex,
    /// True when the pattern may span lines; disables line-oriented searching.
    is_multiline: bool,
}
429
430impl Matcher for RegexMatcher<'_> {
431 type Error = NoError;
432
433 #[inline]
434 fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
435 Ok(self
436 .regex
437 .find_at(haystack, at)
438 .map(|m| Match::new(m.start(), m.end())))
439 }
440
441 #[inline]
442 fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
443 if self.is_multiline {
444 None
445 } else {
446 Some(fff_grep::LineTerminator::byte(b'\n'))
447 }
448 }
449}
450
/// Literal-substring `Matcher`.
struct PlainTextMatcher<'a> {
    /// The needle to find. NOTE(review): the case-insensitive path treats this
    /// as already lowercased — confirm at the construction site.
    needle: &'a [u8],
    /// Use ASCII case-insensitive comparison.
    case_insensitive: bool,
}
467
468impl Matcher for PlainTextMatcher<'_> {
469 type Error = NoError;
470
471 #[inline]
472 fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
473 let hay = &haystack[at..];
474
475 let found = if self.case_insensitive {
476 ascii_case_insensitive_find(hay, self.needle)
479 } else {
480 memchr::memmem::find(hay, self.needle)
481 };
482
483 Ok(found.map(|pos| Match::new(at + pos, at + pos + self.needle.len())))
484 }
485
486 #[inline]
487 fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
488 Some(fff_grep::LineTerminator::byte(b'\n'))
489 }
490}
491
/// Finds the first ASCII case-insensitive occurrence of `needle_lower`
/// (which must already be lowercased) in `haystack`, returning its start
/// offset. Scans candidate positions with `memchr2` over both cases of the
/// first needle byte, then verifies the tail with `ascii_case_eq`.
#[inline]
fn ascii_case_insensitive_find(haystack: &[u8], needle_lower: &[u8]) -> Option<usize> {
    let nlen = needle_lower.len();
    // An empty needle matches at the start (mirrors `memmem::find`).
    if nlen == 0 {
        return Some(0);
    }

    if haystack.len() < nlen {
        return None;
    }

    let first_lo = needle_lower[0];
    let first_hi = first_lo.to_ascii_uppercase();

    // Single-byte needle: a two-byte scan is the whole search.
    if nlen == 1 {
        return memchr::memchr2(first_lo, first_hi, haystack);
    }

    let tail = &needle_lower[1..];
    // Last position where a full-length match can still start.
    let end = haystack.len() - nlen;

    for pos in memchr::memchr2_iter(first_lo, first_hi, &haystack[..=end]) {
        // SAFETY: `pos <= end`, therefore `pos + nlen <= haystack.len()` and
        // the slice `pos + 1 .. pos + nlen` is in bounds.
        let candidate = unsafe { haystack.get_unchecked(pos + 1..pos + nlen) };
        if ascii_case_eq(candidate, tail) {
            return Some(pos);
        }
    }
    None
}
532
/// ASCII case-insensitive equality of two equal-length byte slices.
///
/// Compares 8 bytes at a time: identical words are accepted immediately, and
/// only differing words fall back to a per-byte `eq_ignore_ascii_case` check.
///
/// Fixes a correctness bug in the previous version: the `| 0x20` trick
/// declared any byte pair differing only in bit 0x20 equal, which wrongly
/// matched non-letter pairs such as `[` vs `{`, `@` vs `` ` `` and `\` vs `|`.
/// `u8::eq_ignore_ascii_case` only folds case for actual ASCII letters.
#[inline]
fn ascii_case_eq(a: &[u8], b: &[u8]) -> bool {
    debug_assert_eq!(a.len(), b.len());

    const WORD: usize = 8;
    let mut words_a = a.chunks_exact(WORD);
    let mut words_b = b.chunks_exact(WORD);

    for (ca, cb) in words_a.by_ref().zip(words_b.by_ref()) {
        let va = u64::from_ne_bytes(ca.try_into().unwrap());
        let vb = u64::from_ne_bytes(cb.try_into().unwrap());
        // Fast path: byte-identical word. Slow path: verify each byte with a
        // letters-only case fold.
        if va != vb && !ca.iter().zip(cb).all(|(x, y)| x.eq_ignore_ascii_case(y)) {
            return false;
        }
    }

    // Remaining tail (< 8 bytes): plain per-byte comparison.
    words_a
        .remainder()
        .iter()
        .zip(words_b.remainder())
        .all(|(x, y)| x.eq_ignore_ascii_case(y))
}
578
/// Upper bound, in bytes, on stored line and context text; longer lines are
/// cut at a UTF-8 character boundary by `truncate_display_bytes`.
const MAX_LINE_DISPLAY_LEN: usize = 512;
582
/// Per-file accumulator shared by the sink implementations.
struct SinkState {
    /// Index of the file being searched (rewritten by the caller afterwards).
    file_index: usize,
    /// Matches collected so far for this file.
    matches: Vec<GrepMatch>,
    /// Stop collecting once this many matches exist (0 = unlimited).
    max_matches: usize,
    /// Context lines to capture before each match.
    before_context: usize,
    /// Context lines to capture after each match.
    after_context: usize,
    /// Whether to flag definition-like lines.
    classify_definitions: bool,
}
591
impl SinkState {
    /// Normalizes a raw matched line for storage: strips trailing
    /// newline/carriage-return bytes, truncates to the display limit, and
    /// extracts the line number and absolute byte offset from the sink match.
    ///
    /// Returns `(display_bytes, display_len, line_number, byte_offset)`.
    #[inline]
    fn prepare_line<'a>(line_bytes: &'a [u8], mat: &SinkMatch<'_>) -> (&'a [u8], u32, u64, u64) {
        let line_number = mat.line_number().unwrap_or(0);
        let byte_offset = mat.absolute_byte_offset();

        // Trim trailing line terminators (handles "\n" and "\r\n").
        let trimmed_len = {
            let mut len = line_bytes.len();
            while len > 0 && matches!(line_bytes[len - 1], b'\n' | b'\r') {
                len -= 1;
            }
            len
        };
        let trimmed_bytes = &line_bytes[..trimmed_len];

        // Cap very long lines so stored content stays bounded.
        let display_bytes = truncate_display_bytes(trimmed_bytes);

        let display_len = display_bytes.len() as u32;
        (display_bytes, display_len, line_number, byte_offset)
    }

    /// Appends a `GrepMatch` assembled from the given pieces, classifying the
    /// line as a definition when that option is enabled.
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn push_match(
        &mut self,
        line_number: u64,
        col: usize,
        byte_offset: u64,
        line_content: String,
        match_byte_offsets: SmallVec<[(u32, u32); 4]>,
        context_before: Vec<String>,
        context_after: Vec<String>,
    ) {
        let is_definition = self.classify_definitions && is_definition_line(&line_content);
        self.matches.push(GrepMatch {
            file_index: self.file_index,
            line_number,
            col,
            byte_offset,
            line_content,
            match_byte_offsets,
            fuzzy_score: None,
            is_definition,
            context_before,
            context_after,
        });
    }

    /// Collects up to `before_context` lines preceding and `after_context`
    /// lines following the match by scanning the searcher's buffer for
    /// newline boundaries in each direction.
    fn extract_context(&self, mat: &SinkMatch<'_>) -> (Vec<String>, Vec<String>) {
        if self.before_context == 0 && self.after_context == 0 {
            return (Vec::new(), Vec::new());
        }

        let buffer = mat.buffer();
        let range = mat.bytes_range_in_buffer();

        let mut before = Vec::new();
        if self.before_context > 0 && range.start > 0 {
            let mut pos = range.start;
            let mut lines_found = 0;
            while lines_found < self.before_context && pos > 0 {
                // Step over the '\n' that terminates the previous line.
                pos -= 1;
                let line_start = match memchr::memrchr(b'\n', &buffer[..pos]) {
                    Some(nl) => nl + 1,
                    None => 0,
                };
                let line = &buffer[line_start..pos];
                // Drop a trailing '\r' from CRLF input.
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                before.push(String::from_utf8_lossy(truncated).into_owned());
                pos = line_start;
                lines_found += 1;
            }
            // Lines were gathered walking backwards; restore document order.
            before.reverse();
        }

        let mut after = Vec::new();
        if self.after_context > 0 && range.end < buffer.len() {
            let mut pos = range.end;
            let mut lines_found = 0;
            while lines_found < self.after_context && pos < buffer.len() {
                let line_end = match memchr::memchr(b'\n', &buffer[pos..]) {
                    Some(nl) => pos + nl,
                    None => buffer.len(),
                };
                let line = &buffer[pos..line_end];
                // Drop a trailing '\r' from CRLF input.
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                after.push(String::from_utf8_lossy(truncated).into_owned());
                // Skip the newline when advancing to the next line.
                pos = if line_end < buffer.len() {
                    line_end + 1
                } else {
                    buffer.len()
                };
                lines_found += 1;
            }
        }

        (before, after)
    }
}
710
711#[inline]
713fn truncate_display_bytes(bytes: &[u8]) -> &[u8] {
714 if bytes.len() <= MAX_LINE_DISPLAY_LEN {
715 bytes
716 } else {
717 let mut end = MAX_LINE_DISPLAY_LEN;
718 while end > 0 && !is_utf8_char_boundary(bytes[end]) {
719 end -= 1;
720 }
721 &bytes[..end]
722 }
723}
724
/// Sink that records literal-substring matches per matched line.
struct PlainTextSink<'r> {
    /// Shared per-file accumulator.
    state: SinkState,
    /// Substring finder. NOTE(review): assumed to be built over the lowercased
    /// needle when `case_insensitive` is set — confirm at the construction site.
    finder: &'r memchr::memmem::Finder<'r>,
    /// Needle length in bytes, used to compute match end offsets.
    pattern_len: u32,
    /// Whether the line is lowercased before scanning.
    case_insensitive: bool,
}
737
impl Sink for PlainTextSink<'_> {
    type Error = std::io::Error;

    /// Called for each line the searcher matched; records every occurrence of
    /// the needle on that line. Returns `Ok(false)` to stop the search once
    /// the per-file match cap is reached.
    fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
            return Ok(false);
        }

        let line_bytes = mat.bytes();
        let (display_bytes, display_len, line_number, byte_offset) =
            SinkState::prepare_line(line_bytes, mat);

        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
        let mut col = 0usize;
        let mut first = true;

        if self.case_insensitive {
            // Lowercase the display line into a stack buffer, then scan with
            // the finder. NOTE(review): assumes `finder` holds the lowercased
            // needle — confirm at the construction site.
            let mut lowered = [0u8; MAX_LINE_DISPLAY_LEN];
            let len = display_bytes.len().min(MAX_LINE_DISPLAY_LEN);
            for (dst, &src) in lowered[..len].iter_mut().zip(display_bytes) {
                *dst = src.to_ascii_lowercase();
            }

            let mut start_pos = 0usize;
            while let Some(pos) = self.finder.find(&lowered[start_pos..len]) {
                let abs_start = (start_pos + pos) as u32;
                // Clamp ends to the (possibly truncated) display line.
                let abs_end = (abs_start + self.pattern_len).min(display_len);
                if first {
                    col = abs_start as usize;
                    first = false;
                }
                match_byte_offsets.push((abs_start, abs_end));
                // Advance by one byte so overlapping occurrences are found too.
                start_pos += pos + 1;
            }
        } else {
            let mut start_pos = 0usize;
            while let Some(pos) = self.finder.find(&display_bytes[start_pos..]) {
                let abs_start = (start_pos + pos) as u32;
                let abs_end = (abs_start + self.pattern_len).min(display_len);
                if first {
                    col = abs_start as usize;
                    first = false;
                }
                match_byte_offsets.push((abs_start, abs_end));
                // Advance by one byte so overlapping occurrences are found too.
                start_pos += pos + 1;
            }
        }

        let (context_before, context_after) = self.state.extract_context(mat);
        self.state.push_match(
            line_number,
            col,
            byte_offset,
            line_content,
            match_byte_offsets,
            context_before,
            context_after,
        );
        Ok(true)
    }

    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
        Ok(())
    }
}
806
/// Sink that records regex matches per matched line.
struct RegexSink<'r> {
    /// Shared per-file accumulator.
    state: SinkState,
    /// Compiled pattern, re-run on each matched line to locate every hit.
    re: &'r regex::bytes::Regex,
}
815
816impl Sink for RegexSink<'_> {
817 type Error = std::io::Error;
818
819 fn matched(
820 &mut self,
821 _searcher: &Searcher,
822 sink_match: &SinkMatch<'_>,
823 ) -> Result<bool, Self::Error> {
824 if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
825 return Ok(false);
826 }
827
828 let line_bytes = sink_match.bytes();
829 let (display_bytes, display_len, line_number, byte_offset) =
830 SinkState::prepare_line(line_bytes, sink_match);
831
832 let line_content = String::from_utf8_lossy(display_bytes).into_owned();
833 let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
834 let mut col = 0usize;
835 let mut first = true;
836
837 for m in self.re.find_iter(display_bytes) {
838 let abs_start = m.start() as u32;
839 let abs_end = (m.end() as u32).min(display_len);
840 if first {
841 col = abs_start as usize;
842 first = false;
843 }
844 match_byte_offsets.push((abs_start, abs_end));
845 }
846
847 let (context_before, context_after) = self.state.extract_context(sink_match);
848 self.state.push_match(
849 line_number,
850 col,
851 byte_offset,
852 line_content,
853 match_byte_offsets,
854 context_before,
855 context_after,
856 );
857 Ok(true)
858 }
859
860 fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
861 Ok(())
862 }
863}
864
/// Adapts an Aho-Corasick automaton (multi-pattern literal search) to the
/// `fff_grep::Matcher` trait.
struct AhoCorasickMatcher<'a> {
    /// Automaton built over all patterns.
    ac: &'a AhoCorasick,
}
872
873impl Matcher for AhoCorasickMatcher<'_> {
874 type Error = NoError;
875
876 #[inline]
877 fn find_at(&self, haystack: &[u8], at: usize) -> std::result::Result<Option<Match>, NoError> {
878 let hay = &haystack[at..];
879 let found: Option<aho_corasick::Match> = self.ac.find(hay);
880 Ok(found.map(|m| Match::new(at + m.start(), at + m.end())))
881 }
882
883 #[inline]
884 fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
885 Some(fff_grep::LineTerminator::byte(b'\n'))
886 }
887}
888
/// Sink that records multi-pattern (Aho-Corasick) matches per matched line.
struct AhoCorasickSink<'a> {
    /// Shared per-file accumulator.
    state: SinkState,
    /// Automaton re-run on each matched line to locate every hit.
    ac: &'a AhoCorasick,
}
896
897impl Sink for AhoCorasickSink<'_> {
898 type Error = std::io::Error;
899
900 fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
901 if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
902 return Ok(false);
903 }
904
905 let line_bytes = mat.bytes();
906 let (display_bytes, display_len, line_number, byte_offset) =
907 SinkState::prepare_line(line_bytes, mat);
908
909 let line_content = String::from_utf8_lossy(display_bytes).into_owned();
910 let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
911 let mut col = 0usize;
912 let mut first = true;
913
914 for m in self.ac.find_iter(display_bytes as &[u8]) {
915 let abs_start = m.start() as u32;
916 let abs_end = (m.end() as u32).min(display_len);
917 if first {
918 col = abs_start as usize;
919 first = false;
920 }
921 match_byte_offsets.push((abs_start, abs_end));
922 }
923
924 let (context_before, context_after) = self.state.extract_context(mat);
925 self.state.push_match(
926 line_number,
927 col,
928 byte_offset,
929 line_content,
930 match_byte_offsets,
931 context_before,
932 context_after,
933 );
934 Ok(true)
935 }
936
937 fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
938 Ok(())
939 }
940}
941
/// Searches `files` for any of several literal `patterns` at once using a
/// single Aho-Corasick automaton, honoring constraints, bigram prefiltering,
/// pagination, and cooperative cancellation.
#[allow(clippy::too_many_arguments)]
pub(crate) fn multi_grep_search<'a>(
    files: &'a [FileItem],
    patterns: &[&str],
    constraints: &[fff_query_parser::Constraint<'_>],
    options: &GrepSearchOptions,
    budget: &ContentCacheBudget,
    bigram_index: Option<&BigramFilter>,
    bigram_overlay: Option<&BigramOverlay>,
    abort_signal: &AtomicBool,
    base_path: &Path,
    arena: crate::simd_path::ArenaPtr,
    overflow_arena: crate::simd_path::ArenaPtr,
) -> GrepResult<'a> {
    let total_files = files.len();

    // Nothing to search for: report an empty result over the full file set.
    if patterns.is_empty() || patterns.iter().all(|p| p.is_empty()) {
        return GrepResult {
            total_files,
            filtered_file_count: total_files,
            ..Default::default()
        };
    }

    // Union the per-pattern bigram candidate bitmaps: a file stays a
    // candidate when it may contain ANY of the patterns.
    let bigram_candidates = if let Some(idx) = bigram_index
        && idx.is_ready()
    {
        let mut combined: Option<Vec<u64>> = None;
        for pattern in patterns {
            if let Some(candidates) = idx.query(pattern.as_bytes()) {
                combined = Some(match combined {
                    None => candidates,
                    Some(mut acc) => {
                        // Bitwise OR of the candidate bitmaps.
                        acc.iter_mut()
                            .zip(candidates.iter())
                            .for_each(|(a, b)| *a |= *b);
                        acc
                    }
                });
            }
        }

        // Re-add files flagged by the overlay (`query_modified`) that the
        // base bitmap may not cover.
        if let Some(ref mut candidates) = combined
            && let Some(overlay) = bigram_overlay
        {
            for pattern in patterns {
                let pattern_bigrams = extract_bigrams(pattern.as_bytes());
                for file_idx in overlay.query_modified(&pattern_bigrams) {
                    let word = file_idx / 64;
                    if word < candidates.len() {
                        candidates[word] |= 1u64 << (file_idx % 64);
                    }
                }
            }
        }

        combined
    } else {
        None
    };

    let (mut files_to_search, mut filtered_file_count) =
        prepare_files_to_search(files, constraints, options, arena);

    // If constraints filtered out everything, retry without file-path
    // constraints so the query still has a chance to match.
    if files_to_search.is_empty()
        && let Some(stripped) = strip_file_path_constraints(constraints)
    {
        let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options, arena);
        files_to_search = retry_files;
        filtered_file_count = retry_count;
    }

    // Drop files whose bitmap says they cannot contain any pattern. Overflow
    // files are not covered by the index and are always kept.
    if let Some(ref candidates) = bigram_candidates {
        let base_ptr = files.as_ptr();
        files_to_search.retain(|f| {
            if f.is_overflow() {
                return true;
            }
            // SAFETY: `f` points into the `files` slice, so `offset_from`
            // against its base pointer yields the element's index.
            let file_idx = unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };
            BigramFilter::is_candidate(candidates, file_idx)
        });
    }

    if files_to_search.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count,
            ..Default::default()
        };
    }

    // Smart case: only search case-insensitively when no pattern contains an
    // uppercase character.
    let case_insensitive = if options.smart_case {
        !patterns.iter().any(|p| p.chars().any(|c| c.is_uppercase()))
    } else {
        false
    };

    let ac = aho_corasick::AhoCorasickBuilder::new()
        .ascii_case_insensitive(case_insensitive)
        .build(patterns)
        .expect("Aho-Corasick build should not fail for literal patterns");

    let searcher = {
        let mut b = SearcherBuilder::new();
        b.line_number(true);
        b
    }
    .build();

    let ac_matcher = AhoCorasickMatcher { ac: &ac };
    perform_grep(
        &files_to_search,
        options,
        &GrepContext {
            total_files,
            filtered_file_count,
            budget,
            base_path,
            arena,
            overflow_arena,
            // No literal prefilter: the automaton itself is the first pass.
            prefilter: None,
            prefilter_case_insensitive: false,
            abort_signal,
        },
        |file_bytes: &[u8], max_matches: usize| {
            let state = SinkState {
                file_index: 0,
                matches: Vec::with_capacity(4),
                max_matches,
                before_context: options.before_context,
                after_context: options.after_context,
                classify_definitions: options.classify_definitions,
            };

            let mut sink = AhoCorasickSink { state, ac: &ac };

            if let Err(e) = searcher.search_slice(&ac_matcher, file_bytes, &mut sink) {
                tracing::error!(error = %e, "Grep (aho-corasick multi) search failed");
            }

            sink.state.matches
        },
    )
}
1100
/// True when `b` can start a UTF-8 character (i.e. it is not a continuation
/// byte). Mirrors the standard library's internal boundary check.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // Continuation bytes are exactly 0x80..=0xBF; everything else is a boundary.
    !matches!(b, 0x80..=0xBF)
}
1106
1107fn build_regex(pattern: &str, smart_case: bool) -> Result<regex::bytes::Regex, String> {
1119 if pattern.is_empty() {
1120 return Err("empty pattern".to_string());
1121 }
1122
1123 let regex_pattern = if pattern.contains("\\n") {
1124 pattern.replace("\\n", "\n")
1125 } else {
1126 pattern.to_string()
1127 };
1128
1129 let case_insensitive = if smart_case {
1130 !pattern.chars().any(|c| c.is_uppercase())
1131 } else {
1132 false
1133 };
1134
1135 regex::bytes::RegexBuilder::new(®ex_pattern)
1136 .case_insensitive(case_insensitive)
1137 .multi_line(true)
1138 .unicode(false)
1139 .build()
1140 .map_err(|e| e.to_string())
1141}
1142
1143fn char_indices_to_byte_offsets(line: &str, char_indices: &[usize]) -> SmallVec<[(u32, u32); 4]> {
1153 if char_indices.is_empty() {
1154 return SmallVec::new();
1155 }
1156
1157 let char_byte_ranges: Vec<(usize, usize)> = line
1160 .char_indices()
1161 .map(|(byte_pos, ch)| (byte_pos, byte_pos + ch.len_utf8()))
1162 .collect();
1163
1164 let mut result: SmallVec<[(u32, u32); 4]> = SmallVec::with_capacity(char_indices.len());
1166
1167 for &ci in char_indices {
1168 if ci >= char_byte_ranges.len() {
1169 continue; }
1171 let (start, end) = char_byte_ranges[ci];
1172 if let Some(last) = result.last_mut()
1174 && last.1 == start as u32
1175 {
1176 last.1 = end as u32;
1177 continue;
1178 }
1179 result.push((start as u32, end as u32));
1180 }
1181
1182 result
1183}
1184
1185use crate::case_insensitive_memmem;
1186
/// Files processed per parallel batch when a page limit applies, so a filled
/// page can stop the search without scanning the whole candidate list.
const PAGINATED_CHUNK_SIZE: usize = 512;
1191
/// Core grep driver: runs `search_file` over candidate files in parallel,
/// chunk by chunk, honoring the page limit, time budget, and abort signal,
/// and assembles a paginated `GrepResult`.
#[tracing::instrument(skip_all, level = Level::DEBUG, fields(prefiltered_count = files_to_search.len()))]
fn perform_grep<'a, F>(
    files_to_search: &[&'a FileItem],
    options: &GrepSearchOptions,
    ctx: &GrepContext<'_, '_>,
    search_file: F,
) -> GrepResult<'a>
where
    F: Fn(&[u8], usize) -> Vec<GrepMatch> + Sync,
{
    let time_budget = if options.time_budget_ms > 0 {
        Some(std::time::Duration::from_millis(options.time_budget_ms))
    } else {
        None
    };

    let search_start = std::time::Instant::now();
    let page_limit = options.page_limit;
    let budget_exceeded = AtomicBool::new(false);

    // Paginated searches go in small chunks so a filled page stops early;
    // unpaginated searches take the whole list as one chunk.
    let chunk_size = if page_limit < usize::MAX {
        PAGINATED_CHUNK_SIZE
    } else {
        files_to_search.len().max(1)
    };

    let mut result_files: Vec<&'a FileItem> = Vec::new();
    let mut all_matches: Vec<GrepMatch> = Vec::new();
    let mut files_consumed: usize = 0;
    let mut page_filled = false;

    for chunk in files_to_search.chunks(chunk_size) {
        let chunk_offset = files_consumed;

        // Search the chunk in parallel; each worker reuses one scratch
        // buffer for file content across files.
        let chunk_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = chunk
            .par_iter()
            .enumerate()
            .map_init(
                || Vec::with_capacity(64 * 1024),
                |buf, (local_idx, file)| {
                    if ctx.abort_signal.load(Ordering::Relaxed) {
                        budget_exceeded.store(true, Ordering::Relaxed);
                        return None;
                    }

                    // The time budget only kicks in once more than one match
                    // exists, so a slow search still returns something.
                    if let Some(budget) = time_budget
                        && all_matches.len() > 1
                        && search_start.elapsed() > budget
                    {
                        budget_exceeded.store(true, Ordering::Relaxed);
                        return None;
                    }

                    let content = file.get_content_for_search(
                        buf,
                        ctx.arena_for_file(file),
                        ctx.base_path,
                        ctx.budget,
                    )?;

                    // Cheap literal prefilter before running the full matcher.
                    if let Some(pf) = ctx.prefilter {
                        let found = if ctx.prefilter_case_insensitive {
                            case_insensitive_memmem::search_packed_pair(content, pf.needle())
                        } else {
                            pf.find(content).is_some()
                        };
                        if !found {
                            return None;
                        }
                    }

                    let file_matches = search_file(content, options.max_matches_per_file);

                    if file_matches.is_empty() {
                        return None;
                    }

                    Some((chunk_offset + local_idx, *file, file_matches))
                },
            )
            .flatten()
            .collect();

        files_consumed = chunk_offset + chunk.len();

        // Merge this chunk's hits sequentially, stopping once the page fills.
        for (batch_idx, file, file_matches) in chunk_results {
            let file_result_idx = result_files.len();
            result_files.push(file);

            for mut m in file_matches {
                m.file_index = file_result_idx;
                if options.trim_whitespace {
                    m.trim_leading_whitespace();
                }
                all_matches.push(m);
            }

            if all_matches.len() >= page_limit {
                // Resume next page after the file that filled this one.
                files_consumed = batch_idx + 1;
                page_filled = true;
                break;
            }
        }

        if page_filled || budget_exceeded.load(Ordering::Relaxed) {
            break;
        }
    }

    // No matches anywhere: the whole candidate list was effectively consumed.
    if result_files.is_empty() {
        files_consumed = files_to_search.len();
    }

    let has_more = budget_exceeded.load(Ordering::Relaxed)
        || (page_filled && files_consumed < files_to_search.len());

    let next_file_offset = if has_more {
        options.file_offset + files_consumed
    } else {
        0
    };

    GrepResult {
        matches: all_matches,
        files_with_matches: result_files.len(),
        files: result_files,
        total_files_searched: files_consumed,
        total_files: ctx.total_files,
        filtered_file_count: ctx.filtered_file_count,
        next_file_offset,
        regex_fallback_error: None,
    }
}
1347
1348fn collect_grep_results<'a>(
1353 per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)>,
1354 files_to_search_len: usize,
1355 options: &GrepSearchOptions,
1356 total_files: usize,
1357 filtered_file_count: usize,
1358 budget_exceeded: bool,
1359) -> GrepResult<'a> {
1360 let page_limit = options.page_limit;
1361
1362 let mut result_files: Vec<&'a FileItem> = Vec::new();
1366 let mut all_matches: Vec<GrepMatch> = Vec::new();
1367 let mut files_consumed: usize = 0;
1374
1375 for (batch_idx, file, file_matches) in per_file_results {
1376 files_consumed = batch_idx + 1;
1379
1380 let file_result_idx = result_files.len();
1381 result_files.push(file);
1382
1383 for mut m in file_matches {
1384 m.file_index = file_result_idx;
1385 if options.trim_whitespace {
1386 m.trim_leading_whitespace();
1387 }
1388 all_matches.push(m);
1389 }
1390
1391 if all_matches.len() >= page_limit {
1395 break;
1396 }
1397 }
1398
1399 if result_files.is_empty() {
1401 files_consumed = files_to_search_len;
1402 }
1403
1404 let has_more = budget_exceeded
1405 || (all_matches.len() >= page_limit && files_consumed < files_to_search_len);
1406
1407 let next_file_offset = if has_more {
1408 options.file_offset + files_consumed
1409 } else {
1410 0
1411 };
1412
1413 GrepResult {
1414 matches: all_matches,
1415 files_with_matches: result_files.len(),
1416 files: result_files,
1417 total_files_searched: files_consumed,
1418 total_files,
1419 filtered_file_count,
1420 next_file_offset,
1421 regex_fallback_error: None,
1422 }
1423}
1424
1425fn prepare_files_to_search<'a>(
1431 files: &'a [FileItem],
1432 constraints: &[fff_query_parser::Constraint<'_>],
1433 options: &GrepSearchOptions,
1434 arena: crate::simd_path::ArenaPtr,
1435) -> (Vec<&'a FileItem>, usize) {
1436 let prefiltered: Vec<&FileItem> = if constraints.is_empty() {
1437 files
1438 .iter()
1439 .filter(|f| {
1440 !f.is_deleted() && !f.is_binary() && f.size > 0 && f.size <= options.max_file_size
1441 })
1442 .collect()
1443 } else {
1444 match apply_constraints(files, constraints, arena) {
1445 Some(constrained) => constrained
1446 .into_iter()
1447 .filter(|f| {
1448 !f.is_deleted()
1449 && !f.is_binary()
1450 && f.size > 0
1451 && f.size <= options.max_file_size
1452 })
1453 .collect(),
1454 None => files
1455 .iter()
1456 .filter(|f| {
1457 !f.is_deleted()
1458 && !f.is_binary()
1459 && f.size > 0
1460 && f.size <= options.max_file_size
1461 })
1462 .collect(),
1463 }
1464 };
1465
1466 let total_count = prefiltered.len();
1467 let mut sorted_files = prefiltered;
1468
1469 let needs_sort = sorted_files
1473 .iter()
1474 .any(|f| f.total_frecency_score() != 0 || f.modified != 0);
1475
1476 if needs_sort {
1477 sort_with_buffer(&mut sorted_files, |a, b| {
1478 b.total_frecency_score()
1479 .cmp(&a.total_frecency_score())
1480 .then(b.modified.cmp(&a.modified))
1481 });
1482 }
1483
1484 if options.file_offset > 0 && options.file_offset < total_count {
1485 let paginated = sorted_files.split_off(options.file_offset);
1486 (paginated, total_count)
1487 } else if options.file_offset >= total_count {
1488 (Vec::new(), total_count)
1489 } else {
1490 (sorted_files, total_count)
1492 }
1493}
1494
#[allow(clippy::too_many_arguments)]
/// Runs a typo-tolerant ("fuzzy") grep of `grep_text` over `files_to_search`
/// in parallel with rayon.
///
/// Per file: content is loaded (or read from cache) via
/// `get_content_for_search`, cheaply prefiltered by distinct-byte presence,
/// split into non-empty lines, and scored with the `neo_frizbee` matcher.
/// Scored lines then pass a cascade of quality gates (minimum score, minimum
/// matched chars, match span, match density, gap count) before becoming
/// `GrepMatch` records. Per-file results are merged by `collect_grep_results`.
///
/// Cancellation is cooperative: workers bail out once `abort_signal` is set
/// or `options.time_budget_ms` has elapsed; either case is surfaced through
/// the `budget_exceeded` flag handed to `collect_grep_results`.
fn fuzzy_grep_search<'a>(
    grep_text: &str,
    files_to_search: &[&'a FileItem],
    options: &GrepSearchOptions,
    total_files: usize,
    filtered_file_count: usize,
    case_insensitive: bool,
    budget: &ContentCacheBudget,
    abort_signal: &AtomicBool,
    base_path: &Path,
    arena: crate::simd_path::ArenaPtr,
    _overflow_arena: crate::simd_path::ArenaPtr,
) -> GrepResult<'a> {
    // Allow roughly one typo per three needle chars, capped at two.
    let max_typos = (grep_text.len() / 3).min(2);
    let scoring = neo_frizbee::Scoring {
        exact_match_bonus: 100,
        // No prefix bonus: grep hits mid-line, so prefix position is noise.
        prefix_bonus: 0,
        // Reward matching capitalization only when the search is case-sensitive.
        capitalization_bonus: if case_insensitive { 0 } else { 4 },
        ..neo_frizbee::Scoring::default()
    };

    let matcher = neo_frizbee::Matcher::new(
        grep_text,
        &neo_frizbee::Config {
            max_typos: Some(max_typos as u16),
            sort: false,
            scoring,
        },
    );

    // 16 per needle char — presumably neo_frizbee's base per-character match
    // score; TODO confirm against the crate. Matches must reach 50% of this.
    // NOTE(review): `perfect_score * 50` is u16 arithmetic and would overflow
    // for needles longer than ~81 bytes — worth confirming needle length is
    // bounded upstream.
    let perfect_score = (grep_text.len() as u16) * 16;
    let min_score = (perfect_score * 50) / 100;

    // Matched chars must sit within 3x the needle length on the line.
    let max_match_span = grep_text.len() * 3;
    let needle_len = grep_text.len();

    // Tolerate a gap (non-adjacent matched chars) per ~3 needle chars, min 2.
    let max_gaps = (needle_len / 3).max(2);

    // Both case variants of every needle byte, for the memchr prefilter below.
    let needle_bytes = grep_text.as_bytes();
    let mut unique_needle_chars: Vec<u8> = Vec::new();
    for &b in needle_bytes {
        let lo = b.to_ascii_lowercase();
        let hi = b.to_ascii_uppercase();
        if !unique_needle_chars.contains(&lo) {
            unique_needle_chars.push(lo);
        }
        if lo != hi && !unique_needle_chars.contains(&hi) {
            unique_needle_chars.push(hi);
        }
    }

    // Number of distinct (case-folded) bytes in the needle.
    let unique_count = {
        let mut seen = [false; 256];
        for &b in needle_bytes {
            seen[b.to_ascii_lowercase() as usize] = true;
        }
        seen.iter().filter(|&&v| v).count()
    };
    // A file missing more than `max_typos` of those bytes cannot contain a
    // match, so it can be skipped without line-splitting or scoring.
    let min_chars_required = unique_count.saturating_sub(max_typos);

    let time_budget = if options.time_budget_ms > 0 {
        Some(std::time::Duration::from_millis(options.time_budget_ms))
    } else {
        None
    };
    let search_start = std::time::Instant::now();
    let budget_exceeded = AtomicBool::new(false);
    let max_matches_per_file = options.max_matches_per_file;
    let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
        .par_iter()
        .enumerate()
        .map_init(
            // Per-thread state: a matcher clone and a reusable 64 KiB read buffer.
            || (matcher.clone(), Vec::with_capacity(64 * 1024)),
            |(matcher, buf), (idx, file)| {
                if abort_signal.load(Ordering::Relaxed) {
                    budget_exceeded.store(true, Ordering::Relaxed);
                    return None;
                }

                if let Some(budget) = time_budget
                    && search_start.elapsed() > budget
                {
                    budget_exceeded.store(true, Ordering::Relaxed);
                    return None;
                }

                // `None` here means the content is unavailable (skip the file).
                let file_bytes = file.get_content_for_search(buf, arena, base_path, budget)?;

                // Cheap prefilter: bail unless enough distinct needle bytes
                // (either case) appear somewhere in the file.
                if min_chars_required > 0 {
                    let mut chars_found = 0usize;
                    for &ch in &unique_needle_chars {
                        if memchr::memchr(ch, file_bytes).is_some() {
                            chars_found += 1;
                            if chars_found >= min_chars_required {
                                break;
                            }
                        }
                    }
                    if chars_found < min_chars_required {
                        return None;
                    }
                }

                // Validate UTF-8 once for the whole file so per-line slices
                // can skip re-validation on the fast path.
                let file_is_utf8 = std::str::from_utf8(file_bytes).is_ok();

                let mut stepper = LineStep::new(b'\n', 0, file_bytes.len());
                // Rough heuristic: ~40 bytes per line, at least 64 slots.
                let estimated_lines = (file_bytes.len() / 40).max(64);
                let mut file_lines: Vec<&str> = Vec::with_capacity(estimated_lines);
                // Parallel to `file_lines`: (1-based line number, byte offset).
                let mut line_meta: Vec<(u64, u64)> = Vec::with_capacity(estimated_lines);
                let line_term_lf = fff_grep::LineTerminator::byte(b'\n');
                let line_term_cr = fff_grep::LineTerminator::byte(b'\r');

                let mut line_number: u64 = 1;
                while let Some(line_match) = stepper.next_match(file_bytes) {
                    let byte_offset = line_match.start() as u64;

                    // Strip "\n" then a trailing "\r" (handles CRLF files).
                    let trimmed = lines::without_terminator(
                        lines::without_terminator(&file_bytes[line_match], line_term_lf),
                        line_term_cr,
                    );

                    if !trimmed.is_empty() {
                        let line_str = if file_is_utf8 {
                            // SAFETY: the whole buffer was validated as UTF-8
                            // above, and `trimmed` is sliced only at '\n'/'\r'
                            // (ASCII) boundaries, which cannot split a
                            // multi-byte sequence.
                            unsafe { std::str::from_utf8_unchecked(trimmed) }
                        } else if let Ok(s) = std::str::from_utf8(trimmed) {
                            s
                        } else {
                            // Non-UTF-8 line in a mostly-binary file: skip it
                            // but keep the line counter accurate.
                            line_number += 1;
                            continue;
                        };
                        file_lines.push(line_str);
                        line_meta.push((line_number, byte_offset));
                    }

                    line_number += 1;
                }

                if file_lines.is_empty() {
                    return None;
                }

                // Score every line in one batched matcher call.
                let matches_with_indices = matcher.match_list_indices(&file_lines);
                let mut file_matches: Vec<GrepMatch> = Vec::new();

                for mut match_indices in matches_with_indices {
                    if match_indices.score < min_score {
                        continue;
                    }

                    let idx = match_indices.index as usize;
                    let raw_line = file_lines[idx];

                    // Cap very long lines for display.
                    let truncated = truncate_display_bytes(raw_line.as_bytes());
                    let display_line = if truncated.len() < raw_line.len() {
                        &raw_line[..truncated.len()]
                    } else {
                        raw_line
                    };

                    // If the line was truncated, re-match against the visible
                    // part so the reported indices stay within `display_line`;
                    // drop the hit if it no longer matches.
                    if display_line.len() < raw_line.len() {
                        let Some(re_indices) = matcher
                            .match_list_indices(&[display_line])
                            .into_iter()
                            .next()
                        else {
                            continue;
                        };
                        match_indices = re_indices;
                    }

                    // Sorted order is required by the span/gap checks below.
                    match_indices.indices.sort_unstable();

                    // Require all but `max_typos` needle chars to be matched.
                    let min_matched = needle_len.saturating_sub(max_typos).max(1);
                    if match_indices.indices.len() < min_matched {
                        continue;
                    }

                    let indices = &match_indices.indices;

                    if let (Some(&first), Some(&last)) = (indices.first(), indices.last()) {
                        let span = last - first + 1;
                        if span > max_match_span {
                            continue;
                        }

                        // Density: percentage of the span occupied by matched
                        // chars. Full matches may be looser; partial matches
                        // must be tight to suppress scattered-letter noise.
                        let density = (indices.len() * 100) / span;
                        let min_density = if indices.len() >= needle_len {
                            45
                        } else {
                            65
                        };
                        if density < min_density {
                            continue;
                        }

                        // Count breaks between consecutive matched positions.
                        let gap_count = indices.windows(2).filter(|w| w[1] != w[0] + 1).count();
                        if gap_count > max_gaps {
                            continue;
                        }
                    }

                    let (ln, bo) = line_meta[idx];
                    // Matcher indices are char positions; convert for display.
                    let match_byte_offsets =
                        char_indices_to_byte_offsets(display_line, &match_indices.indices);
                    let col = match_byte_offsets
                        .first()
                        .map(|r| r.0 as usize)
                        .unwrap_or(0);

                    file_matches.push(GrepMatch {
                        // Patched to the real index by collect_grep_results.
                        file_index: 0,
                        line_number: ln,
                        col,
                        byte_offset: bo,
                        is_definition: options.classify_definitions
                            && is_definition_line(display_line),
                        line_content: display_line.to_string(),
                        match_byte_offsets,
                        fuzzy_score: Some(match_indices.score),
                        context_before: Vec::new(),
                        context_after: Vec::new(),
                    });

                    // 0 means "unlimited matches per file".
                    if max_matches_per_file != 0 && file_matches.len() >= max_matches_per_file {
                        break;
                    }
                }

                if file_matches.is_empty() {
                    return None;
                }

                // `idx` preserves the caller's (frecency-sorted) file order.
                Some((idx, *file, file_matches))
            },
        )
        .flatten()
        .collect();

    collect_grep_results(
        per_file_results,
        files_to_search.len(),
        options,
        total_files,
        filtered_file_count,
        budget_exceeded.load(Ordering::Relaxed),
    )
}
1833
#[tracing::instrument(skip_all, fields(file_count = files.len()))]
#[allow(clippy::too_many_arguments)]
/// Top-level grep entry point: resolves the search text from `query`,
/// narrows `files` via constraints and the optional bigram index, then
/// dispatches to plain-text, regex, or fuzzy search per `options.mode`.
///
/// The fuzzy mode returns early through `fuzzy_grep_search`; plain-text and
/// regex modes fall through to the shared `perform_grep` driver below. A
/// regex that fails to compile degrades to a plain-text search and reports
/// the compile error via `regex_fallback_error` on the result.
pub(crate) fn grep_search<'a>(
    files: &'a [FileItem],
    query: &FFFQuery<'_>,
    options: &GrepSearchOptions,
    budget: &ContentCacheBudget,
    bigram_index: Option<&BigramFilter>,
    bigram_overlay: Option<&BigramOverlay>,
    abort_signal: &AtomicBool,
    base_path: &Path,
    arena: crate::simd_path::ArenaPtr,
    overflow_arena: crate::simd_path::ArenaPtr,
) -> GrepResult<'a> {
    let total_files = files.len();

    let constraints_from_query = &query.constraints[..];

    // Work out what to actually search for.
    let grep_text = if !matches!(query.fuzzy_query, fff_query_parser::FuzzyQuery::Empty) {
        query.grep_text()
    } else {
        let t = query.raw_query.trim();
        // A leading backslash is an escape: if the remainder would parse as
        // constraints, search for the remainder literally (presumably so the
        // user can grep for constraint-looking text) — otherwise keep the
        // backslash as part of the search text.
        if t.starts_with('\\') && t.len() > 1 {
            let suffix = &t[1..];
            let parser = QueryParser::new(GrepConfig);
            if !parser.parse(suffix).constraints.is_empty() {
                suffix.to_string()
            } else {
                t.to_string()
            }
        } else {
            t.to_string()
        }
    };

    // Nothing to search for: report counts only.
    if grep_text.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count: total_files,
            next_file_offset: 0,
            matches: Vec::with_capacity(4),
            files: Vec::new(),
            ..Default::default()
        };
    }

    // Smart case: case-insensitive only when the needle is all-lowercase.
    let case_insensitive = if options.smart_case {
        !grep_text.chars().any(|c| c.is_uppercase())
    } else {
        false
    };

    let mut regex_fallback_error: Option<String> = None;
    let regex = match options.mode {
        GrepMode::PlainText => None,
        GrepMode::Fuzzy => {
            let (mut files_to_search, mut filtered_file_count) =
                prepare_files_to_search(files, constraints_from_query, options, arena);

            // If file-path constraints filtered everything out, retry without
            // them rather than returning an empty result.
            if files_to_search.is_empty()
                && let Some(stripped) = strip_file_path_constraints(constraints_from_query)
            {
                let (retry_files, retry_count) =
                    prepare_files_to_search(files, &stripped, options, arena);
                files_to_search = retry_files;
                filtered_file_count = retry_count;
            }

            if files_to_search.is_empty() {
                return GrepResult {
                    total_files,
                    filtered_file_count,
                    next_file_offset: 0,
                    ..Default::default()
                };
            }

            // Bigram prefilter for the fuzzy path.
            if let Some(idx) = bigram_index
                && idx.is_ready()
            {
                let bq = fuzzy_to_bigram_query(&grep_text, 7);
                if !bq.is_any()
                    && let Some(mut candidates) = bq.evaluate(idx)
                {
                    if let Some(overlay) = bigram_overlay {
                        // Clear bits for files deleted since indexing.
                        for (r, t) in candidates.iter_mut().zip(overlay.tombstones().iter()) {
                            *r &= !t;
                        }
                        // Files modified since indexing have stale index
                        // entries — force them back in as candidates.
                        for file_idx in overlay.modified_indices() {
                            let word = file_idx / 64;
                            if word < candidates.len() {
                                candidates[word] |= 1u64 << (file_idx % 64);
                            }
                        }
                    }

                    let base_ptr = files.as_ptr();
                    files_to_search.retain(|f| {
                        // Overflow files are not covered by the base index;
                        // always keep them.
                        if f.is_overflow() {
                            return true;
                        }

                        // SAFETY: non-overflow items are assumed to point into
                        // the `files` slice, so offset_from yields the index.
                        let file_idx =
                            unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };

                        BigramFilter::is_candidate(&candidates, file_idx)
                    });
                }
            }

            return fuzzy_grep_search(
                &grep_text,
                &files_to_search,
                options,
                total_files,
                filtered_file_count,
                case_insensitive,
                budget,
                abort_signal,
                base_path,
                arena,
                overflow_arena,
            );
        }
        // On compile failure, fall back to plain-text search (regex = None)
        // and surface the error to the caller.
        GrepMode::Regex => build_regex(&grep_text, options.smart_case)
            .inspect_err(|err| {
                tracing::warn!("Regex compilation failed for {}. Error {}", grep_text, err);

                regex_fallback_error = Some(err.to_string());
            })
            .ok(),
    };

    // "\n" typed in the query (unescaped) turns on multiline matching.
    let is_multiline = has_unescaped_newline_escape(&grep_text);

    let effective_pattern = if is_multiline {
        replace_unescaped_newline_escapes(&grep_text)
    } else {
        grep_text.to_string()
    };

    // Needle for the memmem prefilter / plain-text matcher; lowercased once
    // here so the per-line comparison can lowercase only the haystack.
    let finder_pattern: Vec<u8> = if case_insensitive {
        effective_pattern.as_bytes().to_ascii_lowercase()
    } else {
        effective_pattern.as_bytes().to_vec()
    };
    let finder = memchr::memmem::Finder::new(&finder_pattern);
    let pattern_len = finder_pattern.len() as u32;

    // Candidate bitset from the bigram index (None = search everything).
    let bigram_candidates = if let Some(idx) = bigram_index
        && idx.is_ready()
    {
        let raw_candidates = if regex.is_none() {
            // Plain text: query the index with the literal pattern bytes.
            idx.query(effective_pattern.as_bytes())
        } else {
            // Regex: derive required bigrams from the pattern if possible.
            let bq = regex_to_bigram_query(&effective_pattern);
            if !bq.is_any() { bq.evaluate(idx) } else { None }
        };

        if let Some(mut candidates) = raw_candidates {
            if let Some(overlay) = bigram_overlay {
                // Drop files deleted since the index was built.
                for (r, t) in candidates.iter_mut().zip(overlay.tombstones().iter()) {
                    *r &= !t;
                }

                if regex.is_none() {
                    // Plain text: only re-add modified files whose current
                    // content still contains the pattern's bigrams.
                    let pattern_bigrams = extract_bigrams(effective_pattern.as_bytes());
                    for file_idx in overlay.query_modified(&pattern_bigrams) {
                        let word = file_idx / 64;
                        if word < candidates.len() {
                            candidates[word] |= 1u64 << (file_idx % 64);
                        }
                    }
                } else {
                    // Regex: conservatively re-add every modified file.
                    for file_idx in overlay.modified_indices() {
                        let word = file_idx / 64;
                        if word < candidates.len() {
                            candidates[word] |= 1u64 << (file_idx % 64);
                        }
                    }
                }
            }
            Some(candidates)
        } else {
            None
        }
    } else {
        None
    };

    // Files at index >= overflow_start were added after the index snapshot
    // and are not represented in the candidate bitset.
    let overflow_start = bigram_overlay
        .map(|o| o.base_file_count())
        .unwrap_or(files.len());

    let (files_to_search, filtered_file_count) = match bigram_candidates {
        // Fast path: no constraints, so iterate the bitset directly instead
        // of scanning all files.
        Some(ref candidates) if constraints_from_query.is_empty() => {
            let overflow_count = files.len().saturating_sub(overflow_start);
            let cap = BigramFilter::count_candidates(candidates) + overflow_count;
            let mut result: Vec<&FileItem> = Vec::with_capacity(cap);

            for (word_idx, &word) in candidates.iter().enumerate() {
                if word == 0 {
                    continue;
                }
                let base = word_idx * 64;
                let mut bits = word;
                while bits != 0 {
                    let bit = bits.trailing_zeros() as usize;
                    let file_idx = base + bit;
                    // Guard against set bits at/after overflow_start; overflow
                    // files are appended in the loop below instead (avoids the
                    // duplicate-result bug pinned by the trailing-bits test).
                    if file_idx < overflow_start {
                        // SAFETY: file_idx < overflow_start, which is assumed
                        // to be <= files.len() (the index snapshot size).
                        let f = unsafe { files.get_unchecked(file_idx) };
                        // Deleted base files are already cleared via the
                        // tombstone pass above — presumably why is_deleted()
                        // is not re-checked here.
                        if !f.is_binary() && f.size <= options.max_file_size {
                            result.push(f);
                        }
                    }
                    bits &= bits - 1;
                }
            }

            // Always search files added after the index snapshot.
            for f in &files[overflow_start..] {
                if !f.is_binary() && !f.is_deleted() && f.size <= options.max_file_size {
                    result.push(f);
                }
            }

            let total_searchable = files.len();
            // Skip the sort when no file has ranking signal.
            let needs_sort = result
                .iter()
                .any(|f| f.total_frecency_score() != 0 || f.modified != 0);

            if needs_sort {
                // Frecency first, then recency, both descending.
                sort_with_buffer(&mut result, |a, b| {
                    b.total_frecency_score()
                        .cmp(&a.total_frecency_score())
                        .then(b.modified.cmp(&a.modified))
                });
            }

            // Pagination by file offset.
            if options.file_offset > 0 && options.file_offset < result.len() {
                let paginated = result.split_off(options.file_offset);
                (paginated, total_searchable)
            } else if options.file_offset >= result.len() {
                (Vec::new(), total_searchable)
            } else {
                (result, total_searchable)
            }
        }
        // General path: constraint filtering first, then intersect with the
        // bigram candidates if any.
        _ => {
            let (mut fts, mut fc) =
                prepare_files_to_search(files, constraints_from_query, options, arena);

            // Same file-path-constraint fallback as the fuzzy branch.
            if fts.is_empty()
                && let Some(stripped) = strip_file_path_constraints(constraints_from_query)
            {
                let (retry_files, retry_count) =
                    prepare_files_to_search(files, &stripped, options, arena);
                fts = retry_files;
                fc = retry_count;
            }

            if let Some(ref candidates) = bigram_candidates {
                let base_ptr = files.as_ptr();
                fts.retain(|f| {
                    // Overflow files are outside the index; always keep.
                    if f.is_overflow() {
                        return true;
                    }

                    // SAFETY: non-overflow items are assumed to point into
                    // the `files` slice, so offset_from yields the index.
                    let file_idx =
                        unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };
                    BigramFilter::is_candidate(candidates, file_idx)
                });
            }

            (fts, fc)
        }
    };

    if files_to_search.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count,
            next_file_offset: 0,
            ..Default::default()
        };
    }

    let plain_matcher = PlainTextMatcher {
        needle: &finder_pattern,
        case_insensitive,
    };

    let searcher = {
        let mut b = SearcherBuilder::new();
        b.line_number(true).multi_line(is_multiline);
        b
    }
    .build();

    // The memmem prefilter is only valid for literal (non-regex) patterns.
    let should_prefilter = regex.is_none();
    let mut result = perform_grep(
        &files_to_search,
        options,
        &GrepContext {
            total_files,
            filtered_file_count,
            budget,
            base_path,
            arena,
            overflow_arena,
            prefilter: should_prefilter.then_some(&finder),
            prefilter_case_insensitive: case_insensitive,
            abort_signal,
        },
        // Per-file search closure: builds a fresh sink per file and runs the
        // shared searcher over the file's bytes.
        |file_bytes: &[u8], max_matches: usize| {
            let state = SinkState {
                file_index: 0,
                matches: Vec::with_capacity(4),
                max_matches,
                before_context: options.before_context,
                after_context: options.after_context,
                classify_definitions: options.classify_definitions,
            };

            match regex {
                Some(ref re) => {
                    let regex_matcher = RegexMatcher {
                        regex: re,
                        is_multiline,
                    };
                    let mut sink = RegexSink { state, re };
                    if let Err(e) = searcher.search_slice(&regex_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (regex) search failed");
                    }
                    sink.state.matches
                }
                None => {
                    let mut sink = PlainTextSink {
                        state,
                        finder: &finder,
                        pattern_len,
                        case_insensitive,
                    };
                    if let Err(e) = searcher.search_slice(&plain_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (plain text) search failed");
                    }
                    sink.state.matches
                }
            }
        },
    );
    result.regex_fallback_error = regex_fallback_error;
    result
}
2223
2224pub fn parse_grep_query(query: &str) -> FFFQuery<'_> {
2225 let parser = QueryParser::new(GrepConfig);
2226 parser.parse(query)
2227}
2228
2229fn strip_file_path_constraints<'a>(
2230 constraints: &[Constraint<'a>],
2231) -> Option<fff_query_parser::ConstraintVec<'a>> {
2232 if !constraints
2233 .iter()
2234 .any(|c| matches!(c, Constraint::FilePath(_)))
2235 {
2236 return None;
2237 }
2238
2239 let filtered: fff_query_parser::ConstraintVec<'a> = constraints
2240 .iter()
2241 .filter(|c| !matches!(c, Constraint::FilePath(_)))
2242 .cloned()
2243 .collect();
2244
2245 Some(filtered)
2246}
2247
#[cfg(test)]
mod tests {
    use super::*;

    /// `\n` in a query string should only count as a multiline escape when
    /// the backslash itself is not escaped.
    #[test]
    fn test_unescaped_newline_detection() {
        // Single backslash + n: a real newline escape.
        assert!(has_unescaped_newline_escape("foo\\nbar"));
        // Escaped backslash followed by a literal 'n': not an escape.
        assert!(!has_unescaped_newline_escape("foo\\\\nvim-data"));
        // Windows-path-style text must not trigger multiline mode.
        assert!(!has_unescaped_newline_escape(
            r#"format!("{}\\AppData\\Local\\nvim-data","#
        ));
        assert!(!has_unescaped_newline_escape("hello world"));
        // Even number of backslashes before 'n': all paired, no escape.
        assert!(!has_unescaped_newline_escape("foo\\\\\\\\nbar"));
        // Odd number: the last backslash escapes the 'n'.
        assert!(has_unescaped_newline_escape("foo\\\\\\nbar"));
    }

    /// Replacement should convert only unescaped `\n` into a real newline
    /// and leave escaped backslash sequences untouched.
    #[test]
    fn test_replace_unescaped_newline() {
        assert_eq!(replace_unescaped_newline_escapes("foo\\nbar"), "foo\nbar");
        assert_eq!(
            replace_unescaped_newline_escapes("foo\\\\nvim"),
            "foo\\\\nvim"
        );
    }

    /// Exercises the fuzzy-match quality gates (min matched chars, span,
    /// density) with the same thresholds fuzzy_grep_search derives, to pin
    /// which haystacks should and should not pass.
    #[test]
    fn test_fuzzy_typo_scoring() {
        let needle = "schema";
        // Mirror fuzzy_grep_search: one typo allowed per 3 chars, capped at 2.
        let max_typos = (needle.len() / 3).min(2);
        let config = neo_frizbee::Config {
            max_typos: Some(max_typos as u16),
            sort: false,
            scoring: neo_frizbee::Scoring {
                exact_match_bonus: 100,
                ..neo_frizbee::Scoring::default()
            },
        };
        // At most one needle char may go unmatched.
        let min_matched = needle.len().saturating_sub(1).max(1);
        // Matched chars must fall inside a tight window.
        let max_match_span = needle.len() + 4;
        // Returns true when `n` matches haystack `h` and survives the gates.
        let passes = |n: &str, h: &str| -> bool {
            let Some(mut mi) = neo_frizbee::match_list_indices(n, &[h], &config)
                .into_iter()
                .next()
            else {
                return false;
            };
            // The span/density checks require sorted positions.
            mi.indices.sort_unstable();
            if mi.indices.len() < min_matched {
                return false;
            }
            if let (Some(&first), Some(&last)) = (mi.indices.first(), mi.indices.last()) {
                let span = last - first + 1;
                if span > max_match_span {
                    return false;
                }
                let density = (mi.indices.len() * 100) / span;
                if density < 70 {
                    return false;
                }
            }
            true
        };

        // Exact and embedded matches should pass.
        assert!(passes("schema", "schema"));
        assert!(passes("schema", " schema: String,"));
        assert!(passes("schema", "pub fn validate_schema() {}"));
        // A transposition (one typo) should still pass.
        assert!(passes("shcema", "schema"));
        // Scattered partial matches must be rejected.
        assert!(!passes("schema", "it has ema in it"));
        assert!(!passes("schema", "hello world foo bar"));
    }

    /// End-to-end multi-pattern grep over a temp directory: each pattern
    /// should find its file, and the file list should be deduplicated.
    #[test]
    fn test_multi_grep_search() {
        use crate::file_picker::{FilePicker, FilePickerOptions};
        use std::io::Write;

        let dir = tempfile::tempdir().unwrap();

        // grep.rs: contains both "GrepMode" and "GrepMatch".
        {
            let mut f = std::fs::File::create(dir.path().join("grep.rs")).unwrap();
            writeln!(f, "pub enum GrepMode {{").unwrap();
            writeln!(f, " PlainText,").unwrap();
            writeln!(f, " Regex,").unwrap();
            writeln!(f, "}}").unwrap();
            writeln!(f, "pub struct GrepMatch {{").unwrap();
            writeln!(f, " pub line_number: u64,").unwrap();
            writeln!(f, "}}").unwrap();
        }

        // matcher.rs: contains "PlainTextMatcher".
        {
            let mut f = std::fs::File::create(dir.path().join("matcher.rs")).unwrap();
            writeln!(f, "struct PlainTextMatcher {{").unwrap();
            writeln!(f, " needle: Vec<u8>,").unwrap();
            writeln!(f, "}}").unwrap();
        }

        // other.rs: matches none of the patterns.
        {
            let mut f = std::fs::File::create(dir.path().join("other.rs")).unwrap();
            writeln!(f, "fn main() {{").unwrap();
            writeln!(f, " println!(\"hello\");").unwrap();
            writeln!(f, "}}").unwrap();
        }

        let mut picker = FilePicker::new(FilePickerOptions {
            base_path: dir.path().to_str().unwrap().into(),
            watch: false,
            ..Default::default()
        })
        .unwrap();
        picker.collect_files().unwrap();

        let files = picker.get_files();
        let arena = picker.arena_base_ptr();

        let options = super::GrepSearchOptions {
            max_file_size: 10 * 1024 * 1024,
            max_matches_per_file: 0,
            smart_case: true,
            file_offset: 0,
            page_limit: 100,
            mode: super::GrepMode::PlainText,
            time_budget_ms: 0,
            before_context: 0,
            after_context: 0,
            classify_definitions: false,
            trim_whitespace: false,
            abort_signal: None,
        };
        let no_cancel = AtomicBool::new(false);

        // Three patterns across two files (grep.rs holds two of them).
        let result = super::multi_grep_search(
            files,
            &["GrepMode", "GrepMatch", "PlainTextMatcher"],
            &[],
            &options,
            picker.cache_budget(),
            None,
            None,
            &no_cancel,
            dir.path(),
            arena,
            arena,
        );

        assert!(
            result.matches.len() >= 3,
            "Expected at least 3 matches, got {}",
            result.matches.len()
        );

        let has_grep_mode = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("GrepMode"));
        let has_grep_match = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("GrepMatch"));
        let has_plain_text_matcher = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("PlainTextMatcher"));

        assert!(has_grep_mode, "Should find GrepMode");
        assert!(has_grep_match, "Should find GrepMatch");
        assert!(has_plain_text_matcher, "Should find PlainTextMatcher");

        // grep.rs + matcher.rs only; other.rs must not appear.
        assert_eq!(result.files.len(), 2, "Should match exactly 2 files");

        // Single-pattern search finds exactly its one occurrence.
        let result2 = super::multi_grep_search(
            files,
            &["PlainTextMatcher"],
            &[],
            &options,
            picker.cache_budget(),
            None,
            None,
            &no_cancel,
            dir.path(),
            arena,
            arena,
        );
        assert_eq!(
            result2.matches.len(),
            1,
            "Single pattern should find 1 match"
        );

        // An empty pattern list is a no-op, not a match-everything.
        let result3 = super::multi_grep_search(
            files,
            &[],
            &[],
            &options,
            picker.cache_budget(),
            None,
            None,
            &no_cancel,
            dir.path(),
            arena,
            arena,
        );
        assert_eq!(
            result3.matches.len(),
            0,
            "Empty patterns should find nothing"
        );
    }

    /// Regression test for issue #407: when files are added after the bigram
    /// index snapshot ("overflow" files) and one of them is also flagged as
    /// modified in the overlay, its candidate bit can be set inside the base
    /// bitset word. The bitset fast path must not emit it (file_idx >=
    /// overflow_start) — otherwise the overflow scan would emit it a second
    /// time, producing duplicate results.
    #[test]
    fn test_grep_no_duplicates_with_overflow_trailing_bits() {
        use crate::bigram_filter::{BigramIndexBuilder, BigramOverlay};
        use crate::file_picker::{FilePicker, FilePickerOptions};
        use std::io::Write;
        use std::sync::atomic::AtomicBool;

        let dir = tempfile::tempdir().unwrap();

        // Five base files; three contain "unicorn".
        let base_contents: &[(&str, &str)] = &[
            ("a.txt", "hello unicorn world"),
            ("b.txt", "another unicorn line"),
            ("c.txt", "one more unicorn here"),
            ("d.txt", "nothing special in here"),
            ("e.txt", "just some random content"),
        ];
        for (name, content) in base_contents {
            let mut f = std::fs::File::create(dir.path().join(name)).unwrap();
            writeln!(f, "{}", content).unwrap();
        }

        let mut picker = FilePicker::new(FilePickerOptions {
            base_path: dir.path().to_str().unwrap().into(),
            watch: false,
            ..Default::default()
        })
        .unwrap();
        picker.collect_files().unwrap();
        assert_eq!(picker.get_files().len(), 5);

        // Build a bigram index over only the five base files.
        let base_count = 5usize;
        let consec_builder = BigramIndexBuilder::new(base_count);
        let skip_builder = BigramIndexBuilder::new(base_count);
        for (i, (_, content)) in base_contents.iter().enumerate() {
            consec_builder.add_file_content(&skip_builder, i, content.as_bytes());
        }
        let mut index = consec_builder.compress(Some(0));
        index.set_skip_index(skip_builder.compress(Some(0)));
        picker.set_bigram_index(index, BigramOverlay::new(base_count));

        // Add three overflow files after the index snapshot.
        for name in ["f.txt", "g.txt", "h.txt"] {
            let path = dir.path().join(name);
            let mut f = std::fs::File::create(&path).unwrap();
            writeln!(f, "overflow unicorn entry").unwrap();
            drop(f);
            picker.on_create_or_modify(&path);
        }
        assert_eq!(picker.get_files().len(), 8);

        // Pick one overflow file and flag it as modified in the overlay so
        // its bit lands in the (single) base bitset word past overflow_start.
        let overflow_rel = "g.txt";
        let overflow_abs = picker
            .get_files()
            .iter()
            .position(|f| f.relative_path(&picker) == overflow_rel)
            .expect("overflow file should be present");
        assert!(overflow_abs >= base_count);
        assert!(
            overflow_abs < 64,
            "index must fit in the single bitset word"
        );

        if let Some(overlay) = picker.bigram_overlay() {
            overlay
                .write()
                .modify_file(overflow_abs, b"overflow unicorn entry");
        }

        // Run a plain-text grep that matches three base and three overflow files.
        let query = super::parse_grep_query("unicorn");
        let options = super::GrepSearchOptions {
            max_file_size: 10 * 1024 * 1024,
            max_matches_per_file: 0,
            smart_case: true,
            file_offset: 0,
            page_limit: 100,
            mode: super::GrepMode::PlainText,
            time_budget_ms: 0,
            before_context: 0,
            after_context: 0,
            classify_definitions: false,
            trim_whitespace: false,
            abort_signal: Some(std::sync::Arc::new(AtomicBool::new(false))),
        };
        let result = picker.grep(&query, &options);

        let mut paths: Vec<String> = result
            .files
            .iter()
            .map(|f| f.relative_path(&picker))
            .collect();
        paths.sort();

        // Dedup must be a no-op: any difference means duplicates were returned.
        let mut dedup = paths.clone();
        dedup.dedup();
        assert_eq!(
            dedup, paths,
            "grep must not return duplicate results (issue #407): {:?}",
            paths
        );
        assert_eq!(
            paths,
            vec!["a.txt", "b.txt", "c.txt", "f.txt", "g.txt", "h.txt"],
        );

        // Each matching file contains "unicorn" exactly once.
        assert_eq!(
            result.matches.len(),
            6,
            "expected exactly one match per file, got {}",
            result.matches.len()
        );
    }
}