// fff_search/grep.rs
1//! High-performance grep engine for live content search.
2//!
3//! Searches file contents using the `grep-searcher` crate with mmap-backed
4//! file access. Files are searched in frecency order for optimal pagination
5//! performance — the most relevant files are searched first, enabling early
6//! termination once enough results are collected.
7
8use crate::{
9    BigramFilter, BigramOverlay,
10    bigram_query::{fuzzy_to_bigram_query, regex_to_bigram_query},
11    constraints::apply_constraints,
12    extract_bigrams,
13    sort_buffer::sort_with_buffer,
14    types::{ContentCacheBudget, FileItem},
15};
16use aho_corasick::AhoCorasick;
17pub use fff_grep::{
18    Searcher, SearcherBuilder, Sink, SinkMatch,
19    lines::{self, LineStep},
20    matcher::{Match, Matcher, NoError},
21};
22use fff_query_parser::{Constraint, FFFQuery, GrepConfig, QueryParser};
23use rayon::prelude::*;
24use smallvec::SmallVec;
25use std::path::Path;
26use std::sync::Arc;
27use std::sync::atomic::{AtomicBool, Ordering};
28use tracing::Level;
29
30/// Detect if a line looks like a code definition (struct, fn, class, etc.).
31///
32/// Used at match time to tag `GrepMatch::is_definition` so that output
33/// formatters can sort/annotate definitions without re-scanning lines.
34///
35/// Hand-rolled keyword scanner — avoids regex overhead entirely.
36/// Strips optional visibility/modifier keywords, then checks if the next
37/// token is a definition keyword followed by a word boundary.
38pub fn is_definition_line(line: &str) -> bool {
39    let s = line.trim_start().as_bytes();
40    let s = skip_modifiers(s);
41    is_definition_keyword(s)
42}
43
/// Modifier keywords that can precede a definition keyword.
/// Each must be followed by whitespace to be consumed (so `public` is not
/// mistaken for `pub` + garbage — `skip_modifiers` checks the boundary).
/// `pub(crate)`-style visibility is handled separately in `skip_modifiers`.
const MODIFIERS: &[&[u8]] = &[
    b"pub",
    b"export",
    b"default",
    b"async",
    b"abstract",
    b"unsafe",
    b"static",
    b"protected",
    b"private",
    b"public",
];
58
/// Definition keywords to detect. Covers Rust, JS/TS, Python, Go, Java/C#
/// style declarations; matched with a word-boundary check in
/// `is_definition_keyword` so e.g. `typescript` does not match `type`.
const DEF_KEYWORDS: &[&[u8]] = &[
    b"struct",
    b"fn",
    b"enum",
    b"trait",
    b"impl",
    b"class",
    b"interface",
    b"function",
    b"def",
    b"func",
    b"type",
    b"module",
    b"object",
];
75
76/// Skip zero or more modifier keywords (including `pub(crate)` style visibility).
77fn skip_modifiers(mut s: &[u8]) -> &[u8] {
78    loop {
79        // Handle `pub(...)` — e.g. `pub(crate)`, `pub(super)`
80        if s.starts_with(b"pub(")
81            && let Some(end) = s.iter().position(|&b| b == b')')
82        {
83            s = skip_ws(&s[end + 1..]);
84            continue;
85        }
86        let mut matched = false;
87        for &kw in MODIFIERS {
88            if s.starts_with(kw) {
89                let rest = &s[kw.len()..];
90                if rest.first().is_some_and(|b| b.is_ascii_whitespace()) {
91                    s = skip_ws(rest);
92                    matched = true;
93                    break;
94                }
95            }
96        }
97        if !matched {
98            return s;
99        }
100    }
101}
102
103/// Check if `s` starts with a definition keyword followed by a word boundary.
104fn is_definition_keyword(s: &[u8]) -> bool {
105    for &kw in DEF_KEYWORDS {
106        if s.starts_with(kw) {
107            let after = s.get(kw.len());
108            // Word boundary: end of input, or next byte is not alphanumeric/underscore
109            if after.is_none_or(|b| !b.is_ascii_alphanumeric() && *b != b'_') {
110                return true;
111            }
112        }
113    }
114    false
115}
116
/// Skip leading ASCII whitespace, returning the remaining suffix.
#[inline]
fn skip_ws(s: &[u8]) -> &[u8] {
    let mut rest = s;
    while let Some((&first, tail)) = rest.split_first() {
        if !first.is_ascii_whitespace() {
            break;
        }
        rest = tail;
    }
    rest
}
126
127/// Detect import/use lines — lower value than definitions or usages.
128///
129/// Checks if the line (after leading whitespace) starts with a common
130/// import statement prefix. Pure byte-level checks, no regex.
131pub fn is_import_line(line: &str) -> bool {
132    let s = line.trim_start().as_bytes();
133    s.starts_with(b"import ")
134        || s.starts_with(b"import\t")
135        || (s.starts_with(b"from ") && s.get(5).is_some_and(|&b| b == b'\'' || b == b'"'))
136        || s.starts_with(b"use ")
137        || s.starts_with(b"use\t")
138        || starts_with_require(s)
139        || starts_with_include(s)
140}
141
/// Match `require(` or `require (` (exactly zero or one space before the paren).
#[inline]
fn starts_with_require(s: &[u8]) -> bool {
    match s.strip_prefix(b"require") {
        Some([b'(', ..]) => true,
        Some([b' ', b'(', ..]) => true,
        _ => false,
    }
}
151
152/// Match `# include ` (with optional spaces after `#`).
153#[inline]
154fn starts_with_include(s: &[u8]) -> bool {
155    if s.first() != Some(&b'#') {
156        return false;
157    }
158    let rest = skip_ws(&s[1..]);
159    rest.starts_with(b"include ") || rest.starts_with(b"include\t")
160}
161
162/// Determine whether `text` contains any regex metacharacters.
163/// Uses `regex::escape` from the regex crate as the source of truth — if the
164/// escaped form differs from the original, the text contains characters that
165/// would be interpreted as regex syntax. This is deterministic and always in
166/// sync with the regex engine (no hand-rolled heuristic to maintain).
167///
168/// Callers can use this to choose between `GrepMode::Regex` and
169/// `GrepMode::PlainText`. When `Regex` mode is chosen and the pattern turns
170/// out to be invalid, `grep_search` already falls back to plain-text matching
171/// and populates `regex_fallback_error`.
172pub fn has_regex_metacharacters(text: &str) -> bool {
173    regex::escape(text) != text
174}
175
/// Check if `text` contains `\n` that is NOT preceded by another `\`.
///
/// `\n` → true (user wants multiline search)
/// `\\n` → false (escaped backslash followed by literal `n`, e.g. `\\nvim-data`)
#[inline]
fn has_unescaped_newline_escape(text: &str) -> bool {
    let bytes = text.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }
        if bytes[i + 1] == b'n' {
            // Length of the backslash run ending at `i` (inclusive).
            // An odd run means the final backslash is unescaped → real \n.
            let run = 1 + bytes[..i].iter().rev().take_while(|&&b| b == b'\\').count();
            if run % 2 == 1 {
                return true;
            }
        }
        // The byte after the backslash was consumed as an escape target.
        i += 2;
    }
    false
}
205
/// Replace only unescaped `\n` sequences with real newlines.
///
/// `\n` → newline character
/// `\\n` → preserved as-is (literal backslash + `n`)
fn replace_unescaped_newline_escapes(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while let Some(&b) = bytes.get(i) {
        if b == b'\\' && bytes.get(i + 1) == Some(&b'n') {
            // Length of the backslash run ending at `i`; odd → the `\n`
            // escape is live and should become a real newline.
            let run = 1 + bytes[..i].iter().rev().take_while(|&&c| c == b'\\').count();
            if run % 2 == 1 {
                out.push(b'\n');
                i += 2;
                continue;
            }
        }
        out.push(b);
        i += 1;
    }
    // Replacing ASCII `\n` pairs cannot break UTF-8; the fallback is defensive.
    String::from_utf8(out).unwrap_or_else(|_| text.to_string())
}
236
/// Controls how the grep pattern is interpreted.
/// Selected via `GrepSearchOptions::mode`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum GrepMode {
    /// Default mode: the query is treated as literal text.
    /// The pattern is searched using SIMD-accelerated `memchr::memmem`.
    /// Special regex characters in the query have no special meaning.
    #[default]
    PlainText,
    /// Regex mode: the query is treated as a regular expression.
    /// Uses the same `grep-matcher` / `regex::bytes::Regex` engine.
    /// Invalid regex patterns will return zero results (not an error).
    Regex,
    /// Fuzzy mode: the query is treated as a fuzzy needle matched against
    /// each line using neo_frizbee's Smith-Waterman scoring. Lines are ranked
    /// by match score. Individual matched character positions are reported
    /// as highlight ranges.
    Fuzzy,
}
255
/// A single content match within a file.
/// Produced by the sink implementations below and returned in `GrepResult::matches`.
#[derive(Debug, Clone)]
pub struct GrepMatch {
    /// Index into the deduplicated `files` vec of the GrepResult.
    pub file_index: usize,
    /// 1-based line number. 0 when the searcher did not track line numbers.
    pub line_number: u64,
    /// 0-based byte column of first match start within the line.
    pub col: usize,
    /// Absolute byte offset of the matched line from the start of the file.
    /// Can be used by the preview to seek directly without scanning from the top.
    pub byte_offset: u64,
    /// The matched line text, truncated to `MAX_LINE_DISPLAY_LEN`.
    pub line_content: String,
    /// Byte offsets `(start, end)` within `line_content` for each match.
    /// Stack-allocated for the common case of ≤4 spans per line.
    pub match_byte_offsets: SmallVec<[(u32, u32); 4]>,
    /// Fuzzy match score from neo_frizbee (only set in Fuzzy grep mode).
    pub fuzzy_score: Option<u16>,
    /// Whether the matched line looks like a definition (struct, fn, class, etc.).
    /// Computed at match time so output formatters don't need to re-scan.
    /// Always false unless `GrepSearchOptions::classify_definitions` is set.
    pub is_definition: bool,
    /// Lines before the match (for context display). Empty when context is 0.
    pub context_before: Vec<String>,
    /// Lines after the match (for context display). Empty when context is 0.
    pub context_after: Vec<String>,
}
283
284impl GrepMatch {
285    /// Strip leading whitespace from `line_content` and all context lines,
286    /// adjusting `col` and `match_byte_offsets` so highlights remain correct.
287    pub fn trim_leading_whitespace(&mut self) {
288        let strip_len = self.line_content.len() - self.line_content.trim_start().len();
289        if strip_len > 0 {
290            self.line_content.drain(..strip_len);
291            let off = strip_len as u32;
292            self.col = self.col.saturating_sub(strip_len);
293            for range in &mut self.match_byte_offsets {
294                range.0 = range.0.saturating_sub(off);
295                range.1 = range.1.saturating_sub(off);
296            }
297        }
298        for line in &mut self.context_before {
299            let n = line.len() - line.trim_start().len();
300            if n > 0 {
301                line.drain(..n);
302            }
303        }
304        for line in &mut self.context_after {
305            let n = line.len() - line.trim_start().len();
306            if n > 0 {
307                line.drain(..n);
308            }
309        }
310    }
311}
312
/// Result of a grep search.
/// Borrows `FileItem`s from the index, hence the `'a` lifetime.
#[derive(Debug, Clone, Default)]
pub struct GrepResult<'a> {
    /// All collected matches; each references `files` via `GrepMatch::file_index`.
    pub matches: Vec<GrepMatch>,
    /// Deduplicated file references for the returned matches.
    pub files: Vec<&'a FileItem>,
    /// Number of files actually searched in this call.
    pub total_files_searched: usize,
    /// Total number of indexed files (before filtering).
    pub total_files: usize,
    /// Total number of searchable files (after filtering out binary, too-large, etc.).
    pub filtered_file_count: usize,
    /// Number of files that contained at least one match.
    pub files_with_matches: usize,
    /// The file offset to pass for the next page. `0` if there are no more files.
    /// Callers should store this and pass it as `file_offset` in the next call.
    pub next_file_offset: usize,
    /// When regex mode fails to compile the pattern, the search falls back to
    /// literal matching and this field contains the compilation error message.
    /// The UI can display this to inform the user their regex was invalid.
    pub regex_fallback_error: Option<String>,
}
335
/// Options for grep search. See the `Default` impl for baseline values.
#[derive(Debug, Clone)]
pub struct GrepSearchOptions {
    /// Files larger than this (bytes) are skipped entirely.
    pub max_file_size: u64,
    /// Per-file cap on collected matches; 0 means unlimited (see sink impls).
    pub max_matches_per_file: usize,
    /// Case-insensitive unless the pattern contains an uppercase character.
    pub smart_case: bool,
    /// File-based pagination offset: index into the sorted/filtered file list
    /// to start searching from. Pass 0 for the first page, then use
    /// `GrepResult::next_file_offset` for subsequent pages.
    pub file_offset: usize,
    /// Maximum number of matches to collect before stopping.
    pub page_limit: usize,
    /// How to interpret the search pattern. Defaults to `PlainText`.
    pub mode: GrepMode,
    /// Maximum time in milliseconds to spend searching before returning partial
    /// results. Prevents UI freezes on pathological queries. 0 = no limit.
    pub time_budget_ms: u64,
    /// Number of context lines to include before each match. 0 = disabled.
    pub before_context: usize,
    /// Number of context lines to include after each match. 0 = disabled.
    pub after_context: usize,
    /// Whether to classify each match as a definition line. Adds ~2% overhead
    /// on large repos; disable for interactive grep where it is not needed.
    pub classify_definitions: bool,
    /// Strip leading whitespace from matched lines and context lines, adjusting
    /// highlight byte offsets accordingly. Useful for AI/MCP consumers and UIs
    /// that don't need indentation. Default: false.
    pub trim_whitespace: bool,
    /// External abort signal. When provided, overrides the picker's internal
    /// cancellation flag. Set to `true` to stop the search early and return
    /// partial results. Omit (or use `..Default::default()`) to let the
    /// picker manage cancellation.
    pub abort_signal: Option<Arc<AtomicBool>>,
}
370
371impl Default for GrepSearchOptions {
372    fn default() -> Self {
373        Self {
374            max_file_size: 10 * 1024 * 1024,
375            max_matches_per_file: 200,
376            smart_case: true,
377            file_offset: 0,
378            page_limit: 50,
379            mode: GrepMode::default(),
380            time_budget_ms: 0,
381            before_context: 0,
382            after_context: 0,
383            classify_definitions: false,
384            trim_whitespace: false,
385            abort_signal: None,
386        }
387    }
388}
389
/// Per-search shared state handed to the file-search workers.
/// `Copy` keeps it cheap to pass by value.
#[derive(Clone, Copy)]
struct GrepContext<'a, 'b> {
    // Mirrors `GrepResult::total_files` — indexed files before filtering.
    total_files: usize,
    // Mirrors `GrepResult::filtered_file_count` — searchable files after filtering.
    filtered_file_count: usize,
    // Budget for content caching — presumably limits mmap/cache memory; see ContentCacheBudget.
    budget: &'a ContentCacheBudget,
    // Root directory that relative file paths are resolved against.
    base_path: &'a Path,
    // Path-bytes arena for regular files; selected via `arena_for_file`.
    arena: crate::simd_path::ArenaPtr,
    // Path-bytes arena for overflow files (`FileItem::is_overflow`).
    overflow_arena: crate::simd_path::ArenaPtr,
    // Optional literal prefilter for cheaply rejecting files before a full search.
    prefilter: Option<&'a memchr::memmem::Finder<'b>>,
    // True when the prefilter needle is lowercased (case-insensitive search).
    prefilter_case_insensitive: bool,
    // Cooperative cancellation flag polled during the search.
    abort_signal: &'a AtomicBool,
}
402
403impl GrepContext<'_, '_> {
404    #[inline]
405    fn arena_for_file(&self, file: &FileItem) -> crate::simd_path::ArenaPtr {
406        if file.is_overflow() {
407            self.overflow_arena
408        } else {
409            self.arena
410        }
411    }
412}
413
/// Lightweight wrapper around `regex::bytes::Regex` implementing the
/// `grep_matcher::Matcher` trait required by `grep-searcher`.
///
/// When `is_multiline` is false (the common case), we report `\n` as the
/// line terminator. This enables the **fast** search path in `fff-searcher`:
/// the searcher calls `find()` once on the entire remaining buffer, letting
/// the regex DFA skip non-matching content in a single pass.
///
/// For multiline patterns we must NOT report a line terminator — the regex
/// can match across line boundaries, so the searcher needs the `MultiLine`
/// strategy.
struct RegexMatcher<'r> {
    /// Pre-compiled pattern, borrowed so one compilation serves many files.
    regex: &'r regex::bytes::Regex,
    /// True when the pattern can match across line boundaries.
    is_multiline: bool,
}
429
430impl Matcher for RegexMatcher<'_> {
431    type Error = NoError;
432
433    #[inline]
434    fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
435        Ok(self
436            .regex
437            .find_at(haystack, at)
438            .map(|m| Match::new(m.start(), m.end())))
439    }
440
441    #[inline]
442    fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
443        if self.is_multiline {
444            None
445        } else {
446            Some(fff_grep::LineTerminator::byte(b'\n'))
447        }
448    }
449}
450
/// A `grep_matcher::Matcher` backed by `memchr::memmem` for literal search.
///
/// This is used in `PlainText` mode and is significantly faster than regex
/// for literal patterns: memchr uses SIMD (AVX2/NEON) two-way substring
/// search internally, avoiding the overhead of regex compilation and DFA
/// state transitions.
///
/// Always reports `\n` as line terminator so the searcher uses the fast
/// candidate-line path (plain text can never span lines unless `\n` is
/// literally in the needle, which we handle separately).
struct PlainTextMatcher<'a> {
    /// Case-folded needle bytes for case-insensitive matching.
    /// When case-sensitive, this is the original pattern bytes.
    needle: &'a [u8],
    /// Selects `ascii_case_insensitive_find` vs plain `memmem::find` in `find_at`.
    case_insensitive: bool,
}
467
468impl Matcher for PlainTextMatcher<'_> {
469    type Error = NoError;
470
471    #[inline]
472    fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
473        let hay = &haystack[at..];
474
475        let found = if self.case_insensitive {
476            // ASCII case-insensitive: lowercase the haystack slice on the fly.
477            // We scan with a rolling window to avoid allocating a full copy.
478            ascii_case_insensitive_find(hay, self.needle)
479        } else {
480            memchr::memmem::find(hay, self.needle)
481        };
482
483        Ok(found.map(|pos| Match::new(at + pos, at + pos + self.needle.len())))
484    }
485
486    #[inline]
487    fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
488        Some(fff_grep::LineTerminator::byte(b'\n'))
489    }
490}
491
492/// ASCII case-insensitive substring search.
493///
494/// Uses a SIMD-accelerated two-byte scan (first + last byte of needle) via
495/// `memchr2_iter`, then verifies candidates with a fast byte comparison that
496/// leverages the fact that ASCII case differs only in bit 0x20.
497#[inline]
498fn ascii_case_insensitive_find(haystack: &[u8], needle_lower: &[u8]) -> Option<usize> {
499    let nlen = needle_lower.len();
500    if nlen == 0 {
501        return Some(0);
502    }
503
504    if haystack.len() < nlen {
505        return None;
506    }
507
508    let first_lo = needle_lower[0];
509    let first_hi = first_lo.to_ascii_uppercase();
510
511    // Single-byte needle: just find either case variant.
512    if nlen == 1 {
513        return memchr::memchr2(first_lo, first_hi, haystack);
514    }
515
516    let tail = &needle_lower[1..];
517    let end = haystack.len() - nlen;
518
519    // Scan for candidates where the first byte matches (either case).
520    for pos in memchr::memchr2_iter(first_lo, first_hi, &haystack[..=end]) {
521        // Verify the remaining bytes with bitwise ASCII case-insensitive compare.
522        // For ASCII letters, (a ^ b) & ~0x20 == 0 when they match ignoring case.
523        // For non-letters, exact equality is required; OR-ing with 0x20 maps both
524        // cases to lowercase and is correct for non-alpha bytes that are already equal.
525        let candidate = unsafe { haystack.get_unchecked(pos + 1..pos + nlen) };
526        if ascii_case_eq(candidate, tail) {
527            return Some(pos);
528        }
529    }
530    None
531}
532
/// Fast ASCII case-insensitive byte slice comparison.
///
/// Returns true iff `a` and `b` are equal when ASCII letters are compared
/// case-insensitively; all non-letter bytes must match exactly. Both slices
/// must have the same length (debug-asserted; release builds return false
/// for mismatched lengths instead of reading out of bounds).
///
/// Bug fix: the previous implementation folded case by OR-ing every byte
/// with 0x20, which also conflates non-letter pairs differing only in that
/// bit — e.g. `[`/`{`, `\`/`|`, `]`/`}`, `^`/`~`, `@`/`` ` `` — yielding
/// false positives (its `va != vb` "exact-match shortcut" fell through to
/// the lossy mask rather than catching them). The u64 word compare is kept,
/// but only as a *rejection* filter: if even the case-folded words differ
/// the slices cannot match; ambiguous words (masked-equal, raw-unequal) are
/// re-verified byte-wise with the letter-aware `eq_ignore_ascii_case`.
/// The unaligned `unsafe` pointer reads are replaced with safe
/// `from_ne_bytes` on `[u8; 8]` (alignment 1), which compiles to the same loads.
#[inline]
fn ascii_case_eq(a: &[u8], b: &[u8]) -> bool {
    debug_assert_eq!(a.len(), b.len());
    if a.len() != b.len() {
        return false;
    }
    let len = a.len();
    let mut i = 0;

    // Fast path: compare 8 bytes at a time using u64 bitwise operations.
    while i + 8 <= len {
        let va = u64::from_ne_bytes(a[i..i + 8].try_into().unwrap());
        let vb = u64::from_ne_bytes(b[i..i + 8].try_into().unwrap());

        if va != vb {
            // OR-ing with 0x20 maps uppercase ASCII letters to lowercase.
            // Masked inequality is a sound rejection: letters that match
            // ignoring case always fold to equal bytes.
            const MASK: u64 = 0x2020_2020_2020_2020;
            if (va | MASK) != (vb | MASK) {
                return false;
            }
            // Masked-equal but raw-unequal: either genuine letter-case
            // differences (fine) or non-letters differing only in bit 0x20
            // (not fine) — resolve with the letter-aware stdlib comparison.
            if !a[i..i + 8].eq_ignore_ascii_case(&b[i..i + 8]) {
                return false;
            }
        }
        i += 8;
    }

    // Tail (< 8 bytes): letter-aware per-byte comparison.
    a[i..].eq_ignore_ascii_case(&b[i..])
}
578
/// Maximum bytes of a matched line to keep for display. Prevents minified
/// JS or huge single-line files from blowing up memory.
/// Applied in `truncate_display_bytes`, always flooring to a UTF-8 boundary.
const MAX_LINE_DISPLAY_LEN: usize = 512;
582
/// Per-file accumulator shared by all sink implementations below.
struct SinkState {
    // Index into the caller's deduplicated file list; stamped onto every match.
    file_index: usize,
    // Matches collected so far for this file.
    matches: Vec<GrepMatch>,
    // Per-file cap on matches; 0 means unlimited (sinks check `!= 0` first).
    max_matches: usize,
    // Context lines to capture before each match (0 = disabled).
    before_context: usize,
    // Context lines to capture after each match (0 = disabled).
    after_context: usize,
    // When true, each match is tagged via `is_definition_line`.
    classify_definitions: bool,
}
591
impl SinkState {
    /// Normalize a raw matched line for storage: strip the trailing
    /// newline/CR, truncate to `MAX_LINE_DISPLAY_LEN` on a UTF-8 boundary,
    /// and pull the line number and absolute offset out of the sink match.
    ///
    /// Returns `(display_bytes, display_len, line_number, byte_offset)`.
    #[inline]
    fn prepare_line<'a>(line_bytes: &'a [u8], mat: &SinkMatch<'_>) -> (&'a [u8], u32, u64, u64) {
        // 0 acts as a sentinel when the searcher did not track line numbers.
        let line_number = mat.line_number().unwrap_or(0);
        let byte_offset = mat.absolute_byte_offset();

        // Trim trailing newline/CR directly on bytes to avoid UTF-8 conversion.
        let trimmed_len = {
            let mut len = line_bytes.len();
            while len > 0 && matches!(line_bytes[len - 1], b'\n' | b'\r') {
                len -= 1;
            }
            len
        };
        let trimmed_bytes = &line_bytes[..trimmed_len];

        // Truncate for display (floor to a char boundary).
        let display_bytes = truncate_display_bytes(trimmed_bytes);

        let display_len = display_bytes.len() as u32;
        (display_bytes, display_len, line_number, byte_offset)
    }

    /// Append one `GrepMatch`, stamping `file_index` and (optionally)
    /// classifying the line as a definition. `fuzzy_score` is always `None`
    /// here — the fuzzy path fills it elsewhere.
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn push_match(
        &mut self,
        line_number: u64,
        col: usize,
        byte_offset: u64,
        line_content: String,
        match_byte_offsets: SmallVec<[(u32, u32); 4]>,
        context_before: Vec<String>,
        context_after: Vec<String>,
    ) {
        let is_definition = self.classify_definitions && is_definition_line(&line_content);
        self.matches.push(GrepMatch {
            file_index: self.file_index,
            line_number,
            col,
            byte_offset,
            line_content,
            match_byte_offsets,
            fuzzy_score: None,
            is_definition,
            context_before,
            context_after,
        });
    }

    /// Extract context lines from the full buffer around a matched region.
    ///
    /// NOTE(review): assumes `bytes_range_in_buffer()` starts at the first
    /// byte of the matched line (so `buffer[range.start - 1]` is the previous
    /// line's `\n`) and ends past the line's terminator — confirm against the
    /// fff_grep `SinkMatch` contract.
    fn extract_context(&self, mat: &SinkMatch<'_>) -> (Vec<String>, Vec<String>) {
        // Fast exit when no context was requested — avoids touching the buffer.
        if self.before_context == 0 && self.after_context == 0 {
            return (Vec::new(), Vec::new());
        }

        let buffer = mat.buffer();
        let range = mat.bytes_range_in_buffer();

        let mut before = Vec::new();
        if self.before_context > 0 && range.start > 0 {
            // Walk backward from the start of the match line to find preceding lines
            let mut pos = range.start;
            let mut lines_found = 0;
            while lines_found < self.before_context && pos > 0 {
                // Skip the newline just before our current position
                pos -= 1;
                // Find the previous newline
                let line_start = match memchr::memrchr(b'\n', &buffer[..pos]) {
                    Some(nl) => nl + 1,
                    None => 0,
                };
                let line = &buffer[line_start..pos];
                // Trim trailing \r
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                before.push(String::from_utf8_lossy(truncated).into_owned());
                pos = line_start;
                lines_found += 1;
            }
            // Lines were collected closest-first; present them top-down.
            before.reverse();
        }

        let mut after = Vec::new();
        if self.after_context > 0 && range.end < buffer.len() {
            let mut pos = range.end;
            let mut lines_found = 0;
            while lines_found < self.after_context && pos < buffer.len() {
                // Find the next newline
                let line_end = match memchr::memchr(b'\n', &buffer[pos..]) {
                    Some(nl) => pos + nl,
                    None => buffer.len(),
                };
                let line = &buffer[pos..line_end];
                // Trim trailing \r
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                after.push(String::from_utf8_lossy(truncated).into_owned());
                pos = if line_end < buffer.len() {
                    line_end + 1 // skip past \n
                } else {
                    buffer.len()
                };
                lines_found += 1;
            }
        }

        (before, after)
    }
}
710
711/// Truncate a byte slice for display, respecting UTF-8 char boundaries.
712#[inline]
713fn truncate_display_bytes(bytes: &[u8]) -> &[u8] {
714    if bytes.len() <= MAX_LINE_DISPLAY_LEN {
715        bytes
716    } else {
717        let mut end = MAX_LINE_DISPLAY_LEN;
718        while end > 0 && !is_utf8_char_boundary(bytes[end]) {
719            end -= 1;
720        }
721        &bytes[..end]
722    }
723}
724
/// Sink for `PlainText` mode.
///
/// Highlights are extracted with SIMD-accelerated `memchr::memmem::Finder`.
/// Case-insensitive matching lowercases the line into a stack buffer before
/// searching, keeping positions 1:1 for ASCII.
/// No regex engine is involved at any point.
struct PlainTextSink<'r> {
    state: SinkState,
    // Pre-built finder; in case-insensitive mode it is expected to hold the
    // lowercased needle (the line is lowered to match — see `matched`).
    finder: &'r memchr::memmem::Finder<'r>,
    // Needle length in bytes, used to compute highlight span ends.
    pattern_len: u32,
    case_insensitive: bool,
}
737
impl Sink for PlainTextSink<'_> {
    type Error = std::io::Error;

    /// Record one matched line: truncate it for display, locate every needle
    /// occurrence on it for highlighting, grab context lines, and push a
    /// `GrepMatch`. Returns `Ok(false)` to stop the search at the per-file cap.
    fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
        // Per-file cap (0 = unlimited): stop feeding this sink once reached.
        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
            return Ok(false);
        }

        let line_bytes = mat.bytes();
        let (display_bytes, display_len, line_number, byte_offset) =
            SinkState::prepare_line(line_bytes, mat);

        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
        // `col` stays 0 when no occurrence survives display truncation.
        let mut col = 0usize;
        let mut first = true;

        if self.case_insensitive {
            // Lowercase the display bytes into a stack buffer; positions are 1:1
            // for ASCII so no mapping is needed.
            let mut lowered = [0u8; MAX_LINE_DISPLAY_LEN];
            let len = display_bytes.len().min(MAX_LINE_DISPLAY_LEN);
            for (dst, &src) in lowered[..len].iter_mut().zip(display_bytes) {
                *dst = src.to_ascii_lowercase();
            }

            let mut start_pos = 0usize;
            while let Some(pos) = self.finder.find(&lowered[start_pos..len]) {
                let abs_start = (start_pos + pos) as u32;
                // Clamp the span end to the truncated display length.
                let abs_end = (abs_start + self.pattern_len).min(display_len);
                if first {
                    col = abs_start as usize;
                    first = false;
                }
                match_byte_offsets.push((abs_start, abs_end));
                // NOTE(review): advances by 1, so overlapping occurrences each
                // produce a span (e.g. "aa" in "aaa" yields two) — confirm intended.
                start_pos += pos + 1;
            }
        } else {
            // Case-sensitive path: same scan directly over the display bytes.
            let mut start_pos = 0usize;
            while let Some(pos) = self.finder.find(&display_bytes[start_pos..]) {
                let abs_start = (start_pos + pos) as u32;
                let abs_end = (abs_start + self.pattern_len).min(display_len);
                if first {
                    col = abs_start as usize;
                    first = false;
                }
                match_byte_offsets.push((abs_start, abs_end));
                start_pos += pos + 1;
            }
        }

        let (context_before, context_after) = self.state.extract_context(mat);
        self.state.push_match(
            line_number,
            col,
            byte_offset,
            line_content,
            match_byte_offsets,
            context_before,
            context_after,
        );
        Ok(true)
    }

    /// Nothing to finalize — matches were pushed eagerly in `matched`.
    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
        Ok(())
    }
}
806
/// Sink for `Regex` mode.
///
/// Uses the compiled regex to extract precise variable-length highlight spans
/// from each matched line. No `memmem` finder is involved.
struct RegexSink<'r> {
    state: SinkState,
    // Compiled pattern, borrowed so one compilation serves every file.
    re: &'r regex::bytes::Regex,
}
815
816impl Sink for RegexSink<'_> {
817    type Error = std::io::Error;
818
819    fn matched(
820        &mut self,
821        _searcher: &Searcher,
822        sink_match: &SinkMatch<'_>,
823    ) -> Result<bool, Self::Error> {
824        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
825            return Ok(false);
826        }
827
828        let line_bytes = sink_match.bytes();
829        let (display_bytes, display_len, line_number, byte_offset) =
830            SinkState::prepare_line(line_bytes, sink_match);
831
832        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
833        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
834        let mut col = 0usize;
835        let mut first = true;
836
837        for m in self.re.find_iter(display_bytes) {
838            let abs_start = m.start() as u32;
839            let abs_end = (m.end() as u32).min(display_len);
840            if first {
841                col = abs_start as usize;
842                first = false;
843            }
844            match_byte_offsets.push((abs_start, abs_end));
845        }
846
847        let (context_before, context_after) = self.state.extract_context(sink_match);
848        self.state.push_match(
849            line_number,
850            col,
851            byte_offset,
852            line_content,
853            match_byte_offsets,
854            context_before,
855            context_after,
856        );
857        Ok(true)
858    }
859
860    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
861        Ok(())
862    }
863}
864
/// A `grep_matcher::Matcher` backed by Aho-Corasick for multi-pattern search.
///
/// Finds the first occurrence of any pattern starting at the given offset.
/// Always reports `\n` as the line terminator for the fast candidate-line path.
struct AhoCorasickMatcher<'a> {
    // Borrowed automaton built once per query in `multi_grep_search` and
    // shared with `AhoCorasickSink` for highlight extraction.
    ac: &'a AhoCorasick,
}
872
873impl Matcher for AhoCorasickMatcher<'_> {
874    type Error = NoError;
875
876    #[inline]
877    fn find_at(&self, haystack: &[u8], at: usize) -> std::result::Result<Option<Match>, NoError> {
878        let hay = &haystack[at..];
879        let found: Option<aho_corasick::Match> = self.ac.find(hay);
880        Ok(found.map(|m| Match::new(at + m.start(), at + m.end())))
881    }
882
883    #[inline]
884    fn line_terminator(&self) -> Option<fff_grep::LineTerminator> {
885        Some(fff_grep::LineTerminator::byte(b'\n'))
886    }
887}
888
/// Sink for Aho-Corasick multi-pattern mode.
///
/// Collects all pattern match positions on each matched line for highlighting.
struct AhoCorasickSink<'a> {
    // Per-search accumulator: collected matches, context settings, and the
    // per-file match cap.
    state: SinkState,
    // Same automaton used by `AhoCorasickMatcher`; re-scans each matched
    // line to enumerate every pattern hit for highlighting.
    ac: &'a AhoCorasick,
}
896
897impl Sink for AhoCorasickSink<'_> {
898    type Error = std::io::Error;
899
900    fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
901        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
902            return Ok(false);
903        }
904
905        let line_bytes = mat.bytes();
906        let (display_bytes, display_len, line_number, byte_offset) =
907            SinkState::prepare_line(line_bytes, mat);
908
909        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
910        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
911        let mut col = 0usize;
912        let mut first = true;
913
914        for m in self.ac.find_iter(display_bytes as &[u8]) {
915            let abs_start = m.start() as u32;
916            let abs_end = (m.end() as u32).min(display_len);
917            if first {
918                col = abs_start as usize;
919                first = false;
920            }
921            match_byte_offsets.push((abs_start, abs_end));
922        }
923
924        let (context_before, context_after) = self.state.extract_context(mat);
925        self.state.push_match(
926            line_number,
927            col,
928            byte_offset,
929            line_content,
930            match_byte_offsets,
931            context_before,
932            context_after,
933        );
934        Ok(true)
935    }
936
937    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
938        Ok(())
939    }
940}
941
/// Multi-pattern OR search using Aho-Corasick.
///
/// Builds a single automaton from all patterns and searches each file in one
/// pass. This is significantly faster than regex alternation for literal text
/// searches because Aho-Corasick uses SIMD-accelerated multi-needle matching.
///
/// OR semantics apply at every stage: a file is a bigram candidate if it
/// matches ANY pattern's bigrams, and a line matches if it contains ANY
/// pattern.
///
/// Returns the same `GrepResult` type as `grep_search`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn multi_grep_search<'a>(
    files: &'a [FileItem],
    patterns: &[&str],
    constraints: &[fff_query_parser::Constraint<'_>],
    options: &GrepSearchOptions,
    budget: &ContentCacheBudget,
    bigram_index: Option<&BigramFilter>,
    bigram_overlay: Option<&BigramOverlay>,
    abort_signal: &AtomicBool,
    base_path: &Path,
    arena: crate::simd_path::ArenaPtr,
    overflow_arena: crate::simd_path::ArenaPtr,
) -> GrepResult<'a> {
    let total_files = files.len();

    // Nothing to search for: return an empty result that still reports the
    // full file counts so pagination math stays consistent.
    if patterns.is_empty() || patterns.iter().all(|p| p.is_empty()) {
        return GrepResult {
            total_files,
            filtered_file_count: total_files,
            ..Default::default()
        };
    }

    // Bigram prefiltering: OR the candidate bitsets for each pattern.
    // A file is a candidate if it matches ANY of the patterns' bigrams.
    let bigram_candidates = if let Some(idx) = bigram_index
        && idx.is_ready()
    {
        let mut combined: Option<Vec<u64>> = None;
        for pattern in patterns {
            if let Some(candidates) = idx.query(pattern.as_bytes()) {
                combined = Some(match combined {
                    None => candidates,
                    Some(mut acc) => {
                        // OR: file is candidate if it matches any pattern
                        acc.iter_mut()
                            .zip(candidates.iter())
                            .for_each(|(a, b)| *a |= *b);
                        acc
                    }
                });
            }
        }

        // Also set the bits for files the overlay reports as modified for
        // these patterns' bigrams, so they aren't filtered out below.
        if let Some(ref mut candidates) = combined
            && let Some(overlay) = bigram_overlay
        {
            for pattern in patterns {
                let pattern_bigrams = extract_bigrams(pattern.as_bytes());
                for file_idx in overlay.query_modified(&pattern_bigrams) {
                    let word = file_idx / 64;
                    if word < candidates.len() {
                        candidates[word] |= 1u64 << (file_idx % 64);
                    }
                }
            }
        }

        combined
    } else {
        None
    };

    let (mut files_to_search, mut filtered_file_count) =
        prepare_files_to_search(files, constraints, options, arena);

    // If constraints yielded 0 files and we had FilePath constraints,
    // retry without them (the path token was likely part of the search text).
    if files_to_search.is_empty()
        && let Some(stripped) = strip_file_path_constraints(constraints)
    {
        let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options, arena);
        files_to_search = retry_files;
        filtered_file_count = retry_count;
    }

    // Apply bigram prefilter to the file list
    if let Some(ref candidates) = bigram_candidates {
        let base_ptr = files.as_ptr();
        files_to_search.retain(|f| {
            // Overflow files are not covered by the bitset; always keep them.
            if f.is_overflow() {
                return true;
            }
            // SAFETY: non-overflow entries in `files_to_search` are borrows
            // into the `files` slice (see `prepare_files_to_search`), so the
            // two pointers lie within one allocation as `offset_from`
            // requires. NOTE(review): this assumes `apply_constraints` also
            // only returns borrows into `files` — confirm.
            let file_idx = unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };
            BigramFilter::is_candidate(candidates, file_idx)
        });
    }

    if files_to_search.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count,
            ..Default::default()
        };
    }

    // Smart case: case-insensitive when all patterns are lowercase
    let case_insensitive = if options.smart_case {
        !patterns.iter().any(|p| p.chars().any(|c| c.is_uppercase()))
    } else {
        false
    };

    let ac = aho_corasick::AhoCorasickBuilder::new()
        .ascii_case_insensitive(case_insensitive)
        .build(patterns)
        .expect("Aho-Corasick build should not fail for literal patterns");

    // Line numbers are enabled so sinks receive per-line positions.
    let searcher = {
        let mut b = SearcherBuilder::new();
        b.line_number(true);
        b
    }
    .build();

    let ac_matcher = AhoCorasickMatcher { ac: &ac };
    perform_grep(
        &files_to_search,
        options,
        &GrepContext {
            total_files,
            filtered_file_count,
            budget,
            base_path,
            arena,
            overflow_arena,
            prefilter: None, // no memmem prefilter for multi-pattern search
            prefilter_case_insensitive: false,
            abort_signal,
        },
        // Per-file closure: run the shared searcher over the file's bytes
        // with a fresh sink and hand back the collected matches.
        |file_bytes: &[u8], max_matches: usize| {
            let state = SinkState {
                file_index: 0,
                matches: Vec::with_capacity(4),
                max_matches,
                before_context: options.before_context,
                after_context: options.after_context,
                classify_definitions: options.classify_definitions,
            };

            let mut sink = AhoCorasickSink { state, ac: &ac };

            if let Err(e) = searcher.search_slice(&ac_matcher, file_bytes, &mut sink) {
                tracing::error!(error = %e, "Grep (aho-corasick multi) search failed");
            }

            sink.state.matches
        },
    )
}
1100
/// Returns true when `b` may begin a UTF-8 code point, i.e. it is not a
/// continuation byte. Mirrors the private `u8::is_utf8_char_boundary` from
/// the standard library.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // ASCII bytes (< 0x80) and leading bytes (>= 0xC0) are boundaries;
    // only continuation bytes 0x80..=0xBF are not.
    b < 0x80 || b >= 0xC0
}
1106
1107/// Build a regex from the user's grep text.
1108///
1109/// In `PlainText` mode:
1110/// - Escapes the input for literal matching (users type text, not regex)
1111/// - Applies smart case: case-insensitive unless query has uppercase
1112/// - Detects `\n` for multiline
1113///
1114/// In `Regex` mode:
1115/// - The input is passed directly to the regex engine without escaping
1116/// - Smart case still applies
1117/// - Returns `None` for invalid regex patterns — the caller falls back to literal mode
1118fn build_regex(pattern: &str, smart_case: bool) -> Result<regex::bytes::Regex, String> {
1119    if pattern.is_empty() {
1120        return Err("empty pattern".to_string());
1121    }
1122
1123    let regex_pattern = if pattern.contains("\\n") {
1124        pattern.replace("\\n", "\n")
1125    } else {
1126        pattern.to_string()
1127    };
1128
1129    let case_insensitive = if smart_case {
1130        !pattern.chars().any(|c| c.is_uppercase())
1131    } else {
1132        false
1133    };
1134
1135    regex::bytes::RegexBuilder::new(&regex_pattern)
1136        .case_insensitive(case_insensitive)
1137        .multi_line(true)
1138        .unicode(false)
1139        .build()
1140        .map_err(|e| e.to_string())
1141}
1142
1143/// Convert character-position indices from neo_frizbee into byte-offset
1144/// pairs (start, end) suitable for `match_byte_offsets`.
1145///
1146/// frizbee returns character positions (0-based index into the char
1147/// iterator). We need byte ranges because the UI renderer and Lua layer
1148/// use byte offsets for extmark highlights.
1149///
1150/// Each matched character becomes its own (byte_start, byte_end) pair.
1151/// Adjacent characters are merged into a single contiguous range.
1152fn char_indices_to_byte_offsets(line: &str, char_indices: &[usize]) -> SmallVec<[(u32, u32); 4]> {
1153    if char_indices.is_empty() {
1154        return SmallVec::new();
1155    }
1156
1157    // Build a map: char_index -> (byte_start, byte_end) for all chars.
1158    // Iterating all chars is O(n) in the line length which is bounded by MAX_LINE_DISPLAY_LEN (512).
1159    let char_byte_ranges: Vec<(usize, usize)> = line
1160        .char_indices()
1161        .map(|(byte_pos, ch)| (byte_pos, byte_pos + ch.len_utf8()))
1162        .collect();
1163
1164    // Convert char indices to byte ranges, merging adjacent ranges
1165    let mut result: SmallVec<[(u32, u32); 4]> = SmallVec::with_capacity(char_indices.len());
1166
1167    for &ci in char_indices {
1168        if ci >= char_byte_ranges.len() {
1169            continue; // out of bounds (shouldn't happen with valid data)
1170        }
1171        let (start, end) = char_byte_ranges[ci];
1172        // Merge with previous range if adjacent
1173        if let Some(last) = result.last_mut()
1174            && last.1 == start as u32
1175        {
1176            last.1 = end as u32;
1177            continue;
1178        }
1179        result.push((start as u32, end as u32));
1180    }
1181
1182    result
1183}
1184
1185use crate::case_insensitive_memmem;
1186
/// Minimum chunk size for paginated search. Must be large enough for good
/// thread utilization across rayon's pool (~28 threads on modern hardware)
/// but small enough to allow early termination after few chunks.
/// Consumed by `perform_grep`, which searches one chunk in parallel and then
/// checks the page limit before starting the next.
const PAGINATED_CHUNK_SIZE: usize = 512;
1191
/// Chunked parallel search driver shared by the grep modes that go through it.
///
/// `search_file` receives a file's raw bytes plus the per-file match cap and
/// returns that file's matches; everything around it — chunking, rayon
/// parallelism, the optional memmem prefilter, time/abort budgets, and
/// pagination bookkeeping — lives here.
#[tracing::instrument(skip_all, level = Level::DEBUG, fields(prefiltered_count = files_to_search.len()))]
fn perform_grep<'a, F>(
    files_to_search: &[&'a FileItem],
    options: &GrepSearchOptions,
    ctx: &GrepContext<'_, '_>,
    search_file: F,
) -> GrepResult<'a>
where
    F: Fn(&[u8], usize) -> Vec<GrepMatch> + Sync,
{
    // time_budget_ms == 0 disables the wall-clock budget entirely.
    let time_budget = if options.time_budget_ms > 0 {
        Some(std::time::Duration::from_millis(options.time_budget_ms))
    } else {
        None
    };

    let search_start = std::time::Instant::now();
    let page_limit = options.page_limit;
    let budget_exceeded = AtomicBool::new(false);

    // For paginated searches, process files in chunks to enable early
    // termination. Each chunk is searched in parallel with rayon; between
    // chunks we check whether enough matches have been collected.
    //
    // For full searches (page_limit = MAX), one chunk = all files — same
    // throughput as before, no overhead from the chunking loop.
    //
    // For common queries ("x", "if") with ~99% hit rate: the first 512-file
    // chunk yields ~500 matches, far exceeding page_limit=50. We stop after
    // one chunk (~1ms) instead of searching all 93K files (~175ms).
    let chunk_size = if page_limit < usize::MAX {
        PAGINATED_CHUNK_SIZE
    } else {
        files_to_search.len().max(1)
    };

    let mut result_files: Vec<&'a FileItem> = Vec::new();
    let mut all_matches: Vec<GrepMatch> = Vec::new();
    // Absolute count of files advanced past in files_to_search (matched or
    // not); drives next_file_offset for the following page.
    let mut files_consumed: usize = 0;
    let mut page_filled = false;

    for chunk in files_to_search.chunks(chunk_size) {
        let chunk_offset = files_consumed;

        // Parallel phase: search all files in this chunk concurrently.
        // Within a chunk every file is visited (no gaps), so pagination
        // offsets remain correct across chunk boundaries.
        let chunk_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = chunk
            .par_iter()
            .enumerate()
            .map_init(
                // allocate a single reusable buffer per thread
                || Vec::with_capacity(64 * 1024),
                |buf, (local_idx, file)| {
                    if ctx.abort_signal.load(Ordering::Relaxed) {
                        budget_exceeded.store(true, Ordering::Relaxed);
                        return None;
                    }

                    // Enforce the time budget only once matches have
                    // accumulated (len > 1). `all_matches` is only read here;
                    // it is mutated exclusively in the sequential phase
                    // between chunks, so the shared borrow is race-free.
                    if let Some(budget) = time_budget
                        && all_matches.len() > 1
                        && search_start.elapsed() > budget
                    {
                        budget_exceeded.store(true, Ordering::Relaxed);
                        return None;
                    }

                    let content = file.get_content_for_search(
                        buf,
                        ctx.arena_for_file(file),
                        ctx.base_path,
                        ctx.budget,
                    )?;

                    // Fast whole-file memmem check before entering the
                    // grep-searcher machinery. Skips Vec alloc, Searcher
                    // setup, and line-splitting for files that can't match.
                    if let Some(pf) = ctx.prefilter {
                        let found = if ctx.prefilter_case_insensitive {
                            case_insensitive_memmem::search_packed_pair(content, pf.needle())
                        } else {
                            pf.find(content).is_some()
                        };
                        if !found {
                            return None;
                        }
                    }

                    let file_matches = search_file(content, options.max_matches_per_file);

                    if file_matches.is_empty() {
                        return None;
                    }

                    Some((chunk_offset + local_idx, *file, file_matches))
                },
            )
            .flatten()
            .collect();

        // Every file in the chunk was visited by rayon (matched or not).
        // NOTE(review): on an abort/time-budget stop mid-chunk, files the
        // workers skipped are still counted here, so the next page may resume
        // past files that were never actually searched — confirm intended.
        files_consumed = chunk_offset + chunk.len();

        // Flatten this chunk's results into the accumulator.
        for (batch_idx, file, file_matches) in chunk_results {
            let file_result_idx = result_files.len();
            result_files.push(file);

            for mut m in file_matches {
                m.file_index = file_result_idx;
                if options.trim_whitespace {
                    m.trim_leading_whitespace();
                }
                all_matches.push(m);
            }

            if all_matches.len() >= page_limit {
                // Tighten files_consumed to the file that tipped us over so
                // the next page resumes right after it.
                files_consumed = batch_idx + 1;
                page_filled = true;
                break;
            }
        }

        if page_filled || budget_exceeded.load(Ordering::Relaxed) {
            break;
        }
    }

    // If no file had any match, we searched the entire slice.
    if result_files.is_empty() {
        files_consumed = files_to_search.len();
    }

    let has_more = budget_exceeded.load(Ordering::Relaxed)
        || (page_filled && files_consumed < files_to_search.len());

    let next_file_offset = if has_more {
        options.file_offset + files_consumed
    } else {
        0
    };

    GrepResult {
        matches: all_matches,
        files_with_matches: result_files.len(),
        files: result_files,
        total_files_searched: files_consumed,
        total_files: ctx.total_files,
        filtered_file_count: ctx.filtered_file_count,
        next_file_offset,
        regex_fallback_error: None,
    }
}
1347
1348/// Flatten per-file results into the final `GrepResult`.
1349///
1350/// Shared post-processing for both `run_file_search` (simple closure) and
1351/// `fuzzy_grep_search` (which uses `map_init` for per-thread matcher reuse).
1352fn collect_grep_results<'a>(
1353    per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)>,
1354    files_to_search_len: usize,
1355    options: &GrepSearchOptions,
1356    total_files: usize,
1357    filtered_file_count: usize,
1358    budget_exceeded: bool,
1359) -> GrepResult<'a> {
1360    let page_limit = options.page_limit;
1361
1362    // Each match stores a `file_index` pointing into `result_files` so that
1363    // consumers (FFI JSON, Lua) can look up file metadata without duplicating
1364    // it across every match from the same file.
1365    let mut result_files: Vec<&'a FileItem> = Vec::new();
1366    let mut all_matches: Vec<GrepMatch> = Vec::new();
1367    // files_consumed tracks how far into files_to_search we have advanced,
1368    // counting every file whose results were emitted (with or without matches).
1369    // We use the batch_idx of the last consumed file + 1, which is correct
1370    // because per_file_results only contains files that had matches, and
1371    // files between them that had no matches were still searched and can be
1372    // safely skipped on the next page.
1373    let mut files_consumed: usize = 0;
1374
1375    for (batch_idx, file, file_matches) in per_file_results {
1376        // batch_idx is the 0-based position in files_to_search.
1377        // Advance files_consumed to include this file and all no-match files before it.
1378        files_consumed = batch_idx + 1;
1379
1380        let file_result_idx = result_files.len();
1381        result_files.push(file);
1382
1383        for mut m in file_matches {
1384            m.file_index = file_result_idx;
1385            if options.trim_whitespace {
1386                m.trim_leading_whitespace();
1387            }
1388            all_matches.push(m);
1389        }
1390
1391        // page_limit is a soft cap: we always finish the current file before
1392        // stopping, so no matches are dropped. A page may return up to
1393        // page_limit + max_matches_per_file - 1 matches in the worst case.
1394        if all_matches.len() >= page_limit {
1395            break;
1396        }
1397    }
1398
1399    // If no file had any match, we searched the entire slice.
1400    if result_files.is_empty() {
1401        files_consumed = files_to_search_len;
1402    }
1403
1404    let has_more = budget_exceeded
1405        || (all_matches.len() >= page_limit && files_consumed < files_to_search_len);
1406
1407    let next_file_offset = if has_more {
1408        options.file_offset + files_consumed
1409    } else {
1410        0
1411    };
1412
1413    GrepResult {
1414        matches: all_matches,
1415        files_with_matches: result_files.len(),
1416        files: result_files,
1417        total_files_searched: files_consumed,
1418        total_files,
1419        filtered_file_count,
1420        next_file_offset,
1421        regex_fallback_error: None,
1422    }
1423}
1424
1425/// Filter files by constraints and size/binary checks, sort by frecency,
1426/// and apply file-based pagination.
1427///
1428/// Returns `(paginated_files, filtered_file_count)`. The paginated slice
1429/// is empty if the offset is past the end of available files.
1430fn prepare_files_to_search<'a>(
1431    files: &'a [FileItem],
1432    constraints: &[fff_query_parser::Constraint<'_>],
1433    options: &GrepSearchOptions,
1434    arena: crate::simd_path::ArenaPtr,
1435) -> (Vec<&'a FileItem>, usize) {
1436    let prefiltered: Vec<&FileItem> = if constraints.is_empty() {
1437        files
1438            .iter()
1439            .filter(|f| {
1440                !f.is_deleted() && !f.is_binary() && f.size > 0 && f.size <= options.max_file_size
1441            })
1442            .collect()
1443    } else {
1444        match apply_constraints(files, constraints, arena) {
1445            Some(constrained) => constrained
1446                .into_iter()
1447                .filter(|f| {
1448                    !f.is_deleted()
1449                        && !f.is_binary()
1450                        && f.size > 0
1451                        && f.size <= options.max_file_size
1452                })
1453                .collect(),
1454            None => files
1455                .iter()
1456                .filter(|f| {
1457                    !f.is_deleted()
1458                        && !f.is_binary()
1459                        && f.size > 0
1460                        && f.size <= options.max_file_size
1461                })
1462                .collect(),
1463        }
1464    };
1465
1466    let total_count = prefiltered.len();
1467    let mut sorted_files = prefiltered;
1468
1469    // Only sort when there is meaningful frecency or modification data to rank by.
1470    // On large repos (500k+ files) with no frecency data (fresh session, benchmark),
1471    // skipping the O(n log n) sort saves ~200ms per query.
1472    let needs_sort = sorted_files
1473        .iter()
1474        .any(|f| f.total_frecency_score() != 0 || f.modified != 0);
1475
1476    if needs_sort {
1477        sort_with_buffer(&mut sorted_files, |a, b| {
1478            b.total_frecency_score()
1479                .cmp(&a.total_frecency_score())
1480                .then(b.modified.cmp(&a.modified))
1481        });
1482    }
1483
1484    if options.file_offset > 0 && options.file_offset < total_count {
1485        let paginated = sorted_files.split_off(options.file_offset);
1486        (paginated, total_count)
1487    } else if options.file_offset >= total_count {
1488        (Vec::new(), total_count)
1489    } else {
1490        // offset == 0: no split needed, return as-is
1491        (sorted_files, total_count)
1492    }
1493}
1494
1495/// Fuzzy grep search using SIMD-accelerated `neo_frizbee::match_list`.
1496///
1497/// Why this doesn't use `grep-searcher` / `GrepSink`
1498///
1499/// PlainText and Regex modes use the `grep-searcher` pipeline: a `Matcher`
1500/// finds candidate lines, and a `Sink` collects them one at a time. This
1501/// works well because memchr/regex can *skip* non-matching lines in O(n)
1502/// without scoring every one.
1503///
1504/// Fuzzy matching is fundamentally different. Every line is a candidate —
1505/// the Smith-Waterman score determines whether it passes, not a substring
1506/// or pattern test. The `Matcher::find_at` trait forces per-line calls to
1507/// the *reference* (scalar) smith-waterman, which is O(needle × line_len)
1508/// per line. For a 10k-line file that's 10k sequential reference calls.
1509///
1510/// `neo_frizbee::match_list` solves this by batching lines into
1511/// fixed-width SIMD buckets (4, 8, 12 … 512 bytes) and scoring 16+
1512/// haystacks per SIMD invocation. A single `match_list` call over the
1513/// entire file replaces 10k individual `match_indices` calls. We then
1514/// call `match_indices` *only* on the ~5-20 lines that pass `min_score`
1515/// to extract character highlight positions.
1516///
1517/// Line splitting uses `memchr::memchr` (the same SIMD-accelerated byte
1518/// search that `grep-searcher` and `bstr::ByteSlice::find_byte` use
1519/// internally) to locate `\n` terminators. This gives us the same
1520/// performance as the searcher's `LineStep` iterator without pulling in
1521/// the full searcher machinery.
1522///
1523/// For each file:
1524///   1. mmap the file, split lines via memchr '\n' (tracking line numbers + byte offsets)
1525///   2. Batch all lines through `match_list` (SIMD smith-waterman)
1526///   3. Filter results by `min_score`
1527///   4. Call `match_indices` only on passing lines to get character highlight offsets
1528#[allow(clippy::too_many_arguments)]
1529fn fuzzy_grep_search<'a>(
1530    grep_text: &str,
1531    files_to_search: &[&'a FileItem],
1532    options: &GrepSearchOptions,
1533    total_files: usize,
1534    filtered_file_count: usize,
1535    case_insensitive: bool,
1536    budget: &ContentCacheBudget,
1537    abort_signal: &AtomicBool,
1538    base_path: &Path,
1539    arena: crate::simd_path::ArenaPtr,
1540    _overflow_arena: crate::simd_path::ArenaPtr,
1541) -> GrepResult<'a> {
1542    // max_typos controls how many *needle* characters can be unmatched.
1543    // A transposition (e.g. "shcema" → "schema") costs ~1 typo with
1544    // default gap penalties. We scale max_typos by needle length:
1545    //   1-2 chars → 0 typos (exact subsequence only)
1546    //   3-5 chars → 1 typo
1547    //   6+  chars → 2 typos
1548    // Cap at 2: higher values (3+) let the SIMD prefilter pass lines
1549    // missing key characters entirely (e.g. query "flvencodeX" matching
1550    // lines without 'l' or 'v'). Quality comes from the post-match filters.
1551    let max_typos = (grep_text.len() / 3).min(2);
1552    let scoring = neo_frizbee::Scoring {
1553        // Use default gap penalties. Higher values (e.g. 20) cause
1554        // smith-waterman to prefer *dropping needle chars* over paying
1555        // gap costs, which inflates the typo count and breaks
1556        // transposition matching ("shcema" → "schema" becomes 3 typos instead of 1)
1557        exact_match_bonus: 100,
1558        // gap_open_penalty: 4,
1559        // gap_extend_penalty: 2,
1560        prefix_bonus: 0,
1561        capitalization_bonus: if case_insensitive { 0 } else { 4 },
1562        ..neo_frizbee::Scoring::default()
1563    };
1564
1565    let matcher = neo_frizbee::Matcher::new(
1566        grep_text,
1567        &neo_frizbee::Config {
1568            // Use the real max_typos so frizbee's SIMD prefilter actually rejects non-matching lines (~2 SIMD instructions per line vs full SW scoring).
1569            max_typos: Some(max_typos as u16),
1570            sort: false,
1571            scoring,
1572        },
1573    );
1574
1575    // Minimum score threshold: 50% of a perfect contiguous match.
1576    // With default scoring (match_score=12, matching_case_bonus=4 = 16/char),
1577    // a transposition costs ~5 from a gap, keeping the score well above 50%.
1578    let perfect_score = (grep_text.len() as u16) * 16;
1579    let min_score = (perfect_score * 50) / 100;
1580
1581    // Target identifiers are often longer than the query due to delimiters
1582    // (e.g. query "flvencodepicture" → "ff_flv_encode_picture_header").
1583    // Allow 3x needle length to accommodate underscore/dot-separated names.
1584    let max_match_span = grep_text.len() * 3;
1585    let needle_len = grep_text.len();
1586
1587    // Each delimiter (_, .) in the target creates a gap. A typical C/Rust
1588    // identifier like "ff_flv_encode_picture_header" has 4-5 underscores.
1589    // Scale generously so delimiter gaps don't reject valid matches.
1590    let max_gaps = (needle_len / 3).max(2);
1591
1592    // File-level prefilter: collect unique needle chars (both cases) for
1593    // a fast memchr scan.  If a file doesn't contain enough distinct
1594    // needle characters, skip it entirely — no line splitting needed.
1595    let needle_bytes = grep_text.as_bytes();
1596    let mut unique_needle_chars: Vec<u8> = Vec::new();
1597    for &b in needle_bytes {
1598        let lo = b.to_ascii_lowercase();
1599        let hi = b.to_ascii_uppercase();
1600        if !unique_needle_chars.contains(&lo) {
1601            unique_needle_chars.push(lo);
1602        }
1603        if lo != hi && !unique_needle_chars.contains(&hi) {
1604            unique_needle_chars.push(hi);
1605        }
1606    }
1607
1608    // How many distinct needle chars must appear in the file.
1609    // With max_typos allowed, we need at least (unique_count - max_typos).
1610    let unique_count = {
1611        let mut seen = [false; 256];
1612        for &b in needle_bytes {
1613            seen[b.to_ascii_lowercase() as usize] = true;
1614        }
1615        seen.iter().filter(|&&v| v).count()
1616    };
1617    let min_chars_required = unique_count.saturating_sub(max_typos);
1618
1619    let time_budget = if options.time_budget_ms > 0 {
1620        Some(std::time::Duration::from_millis(options.time_budget_ms))
1621    } else {
1622        None
1623    };
1624    let search_start = std::time::Instant::now();
1625    let budget_exceeded = AtomicBool::new(false);
1626    let max_matches_per_file = options.max_matches_per_file;
1627    // Parallel phase with `map_init`: each rayon worker thread clones the
1628    // matcher once and gets a reusable read buffer. The buffer avoids
1629    // mmap/munmap syscalls for non-cached files.
1630    let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
1631        .par_iter()
1632        .enumerate()
1633        .map_init(
1634            || (matcher.clone(), Vec::with_capacity(64 * 1024)),
1635            |(matcher, buf), (idx, file)| {
1636                if abort_signal.load(Ordering::Relaxed) {
1637                    budget_exceeded.store(true, Ordering::Relaxed);
1638                    return None;
1639                }
1640
1641                if let Some(budget) = time_budget
1642                    && search_start.elapsed() > budget
1643                {
1644                    budget_exceeded.store(true, Ordering::Relaxed);
1645                    return None;
1646                }
1647
1648                let file_bytes = file.get_content_for_search(buf, arena, base_path, budget)?;
1649
1650                // File-level prefilter: check if enough distinct needle chars
1651                // exist anywhere in the file bytes.  Uses memchr for speed.
1652                if min_chars_required > 0 {
1653                    let mut chars_found = 0usize;
1654                    for &ch in &unique_needle_chars {
1655                        if memchr::memchr(ch, file_bytes).is_some() {
1656                            chars_found += 1;
1657                            if chars_found >= min_chars_required {
1658                                break;
1659                            }
1660                        }
1661                    }
1662                    if chars_found < min_chars_required {
1663                        return None;
1664                    }
1665                }
1666
1667                // Validate the whole file as UTF-8 once upfront. Source code
1668                // files are virtually always valid UTF-8; this single check
1669                // replaces per-line from_utf8 calls (~8% of fuzzy grep time).
1670                let file_is_utf8 = std::str::from_utf8(file_bytes).is_ok();
1671
1672                // Reuse grep-searcher's LineStep for SIMD-accelerated line iteration.
1673                let mut stepper = LineStep::new(b'\n', 0, file_bytes.len());
1674                let estimated_lines = (file_bytes.len() / 40).max(64);
1675                let mut file_lines: Vec<&str> = Vec::with_capacity(estimated_lines);
1676                let mut line_meta: Vec<(u64, u64)> = Vec::with_capacity(estimated_lines);
1677                let line_term_lf = fff_grep::LineTerminator::byte(b'\n');
1678                let line_term_cr = fff_grep::LineTerminator::byte(b'\r');
1679
1680                let mut line_number: u64 = 1;
1681                while let Some(line_match) = stepper.next_match(file_bytes) {
1682                    let byte_offset = line_match.start() as u64;
1683
1684                    // Strip line terminators (\n, \r).
1685                    let trimmed = lines::without_terminator(
1686                        lines::without_terminator(&file_bytes[line_match], line_term_lf),
1687                        line_term_cr,
1688                    );
1689
1690                    if !trimmed.is_empty() {
1691                        // SAFETY: when the whole file is valid UTF-8, every
1692                        // sub-slice split on ASCII byte boundaries (\n, \r)
1693                        // is also valid UTF-8.
1694                        let line_str = if file_is_utf8 {
1695                            unsafe { std::str::from_utf8_unchecked(trimmed) }
1696                        } else if let Ok(s) = std::str::from_utf8(trimmed) {
1697                            s
1698                        } else {
1699                            line_number += 1;
1700                            continue;
1701                        };
1702                        file_lines.push(line_str);
1703                        line_meta.push((line_number, byte_offset));
1704                    }
1705
1706                    line_number += 1;
1707                }
1708
1709                if file_lines.is_empty() {
1710                    return None;
1711                }
1712
1713                // Single-pass: score + indices in one Smith-Waterman run per line.
1714                let matches_with_indices = matcher.match_list_indices(&file_lines);
1715                let mut file_matches: Vec<GrepMatch> = Vec::new();
1716
1717                for mut match_indices in matches_with_indices {
1718                    if match_indices.score < min_score {
1719                        continue;
1720                    }
1721
1722                    let idx = match_indices.index as usize;
1723                    let raw_line = file_lines[idx];
1724
1725                    let truncated = truncate_display_bytes(raw_line.as_bytes());
1726                    let display_line = if truncated.len() < raw_line.len() {
1727                        // SAFETY: truncate_display_bytes preserves UTF-8 char boundaries
1728                        &raw_line[..truncated.len()]
1729                    } else {
1730                        raw_line
1731                    };
1732
1733                    // If the line was truncated, re-compute indices on the shorter string.
1734                    if display_line.len() < raw_line.len() {
1735                        let Some(re_indices) = matcher
1736                            .match_list_indices(&[display_line])
1737                            .into_iter()
1738                            .next()
1739                        else {
1740                            continue;
1741                        };
1742                        match_indices = re_indices;
1743                    }
1744
1745                    // upstream returns indices in reverse order, sort ascending
1746                    match_indices.indices.sort_unstable();
1747
1748                    // Minimum matched chars: at least (needle_len - max_typos)
1749                    // characters must appear. This is consistent with the typo
1750                    // budget: each typo can drop one needle char from the alignment.
1751                    let min_matched = needle_len.saturating_sub(max_typos).max(1);
1752                    if match_indices.indices.len() < min_matched {
1753                        continue;
1754                    }
1755
1756                    let indices = &match_indices.indices;
1757
1758                    if let (Some(&first), Some(&last)) = (indices.first(), indices.last()) {
1759                        // Span check: reject widely scattered matches.
1760                        let span = last - first + 1;
1761                        if span > max_match_span {
1762                            continue;
1763                        }
1764
1765                        // Density check: matched chars / span must be dense enough.
1766                        // Relaxed for perfect subsequence matches (all needle chars
1767                        // present), slightly relaxed for typo matches to handle
1768                        // delimiter-heavy targets (e.g. "ff_flv_encode_picture_header"
1769                        // has span inflated by underscores → density ~68%).
1770                        let density = (indices.len() * 100) / span;
1771                        let min_density = if indices.len() >= needle_len {
1772                            45 // Perfect subsequence — relaxed (delimiters inflate span)
1773                        } else {
1774                            65 // Has typos — moderately strict
1775                        };
1776                        if density < min_density {
1777                            continue;
1778                        }
1779
1780                        // Gap count check: count discontinuities in the indices.
1781                        let gap_count = indices.windows(2).filter(|w| w[1] != w[0] + 1).count();
1782                        if gap_count > max_gaps {
1783                            continue;
1784                        }
1785                    }
1786
1787                    let (ln, bo) = line_meta[idx];
1788                    let match_byte_offsets =
1789                        char_indices_to_byte_offsets(display_line, &match_indices.indices);
1790                    let col = match_byte_offsets
1791                        .first()
1792                        .map(|r| r.0 as usize)
1793                        .unwrap_or(0);
1794
1795                    file_matches.push(GrepMatch {
1796                        file_index: 0,
1797                        line_number: ln,
1798                        col,
1799                        byte_offset: bo,
1800                        is_definition: options.classify_definitions
1801                            && is_definition_line(display_line),
1802                        line_content: display_line.to_string(),
1803                        match_byte_offsets,
1804                        fuzzy_score: Some(match_indices.score),
1805                        context_before: Vec::new(),
1806                        context_after: Vec::new(),
1807                    });
1808
1809                    if max_matches_per_file != 0 && file_matches.len() >= max_matches_per_file {
1810                        break;
1811                    }
1812                }
1813
1814                if file_matches.is_empty() {
1815                    return None;
1816                }
1817
1818                Some((idx, *file, file_matches))
1819            },
1820        )
1821        .flatten()
1822        .collect();
1823
1824    collect_grep_results(
1825        per_file_results,
1826        files_to_search.len(),
1827        options,
1828        total_files,
1829        filtered_file_count,
1830        budget_exceeded.load(Ordering::Relaxed),
1831    )
1832}
1833
/// Perform a grep search across all indexed files.
///
/// Dispatches on `options.mode`:
/// - `Fuzzy` — filters files by the query's constraints plus an optional
///   bigram prefilter, then hands off entirely to `fuzzy_grep_search`
///   (early `return` inside the match below).
/// - `PlainText` / `Regex` — runs the grep-searcher engine per file via
///   `perform_grep`. A regex that fails to compile degrades to a
///   plain-text search and reports the error in `regex_fallback_error`.
///
/// When `query` is empty, returns git-modified/untracked files sorted by
/// frecency for the "welcome state" UI.
/// NOTE(review): the visible empty-text path below returns an empty
/// result — the welcome-state listing is presumably produced by a
/// caller; confirm.
#[tracing::instrument(skip_all, fields(file_count = files.len()))]
#[allow(clippy::too_many_arguments)]
pub(crate) fn grep_search<'a>(
    files: &'a [FileItem],
    query: &FFFQuery<'_>,
    options: &GrepSearchOptions,
    budget: &ContentCacheBudget,
    bigram_index: Option<&BigramFilter>,
    bigram_overlay: Option<&BigramOverlay>,
    abort_signal: &AtomicBool,
    base_path: &Path,
    arena: crate::simd_path::ArenaPtr,
    overflow_arena: crate::simd_path::ArenaPtr,
) -> GrepResult<'a> {
    let total_files = files.len();

    // Extract the grep text and file constraints from the parsed query.
    // For grep, the search pattern is the original query with constraint tokens
    // removed. All non-constraint text tokens are collected and joined with
    // spaces to form the grep pattern:
    //   "name = *.rs someth" -> grep "name = someth" with constraint Extension("rs")
    let constraints_from_query = &query.constraints[..];

    let grep_text = if !matches!(query.fuzzy_query, fff_query_parser::FuzzyQuery::Empty) {
        query.grep_text()
    } else {
        // Constraint-only or empty query — use raw_query for backslash-escape handling.
        let t = query.raw_query.trim();
        // A leading backslash escapes a would-be constraint token: if the
        // remainder re-parses into constraints, the user escaped a real
        // constraint and wants to grep for the unescaped suffix.
        if t.starts_with('\\') && t.len() > 1 {
            let suffix = &t[1..];
            let parser = QueryParser::new(GrepConfig);
            if !parser.parse(suffix).constraints.is_empty() {
                suffix.to_string()
            } else {
                t.to_string()
            }
        } else {
            t.to_string()
        }
    };

    // Nothing to search for — return an empty but well-formed result.
    if grep_text.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count: total_files,
            next_file_offset: 0,
            matches: Vec::with_capacity(4),
            files: Vec::new(),
            ..Default::default()
        };
    }

    // Smart case: an all-lowercase query matches case-insensitively; any
    // uppercase character makes the search case-sensitive.
    let case_insensitive = if options.smart_case {
        !grep_text.chars().any(|c| c.is_uppercase())
    } else {
        false
    };

    let mut regex_fallback_error: Option<String> = None;
    let regex = match options.mode {
        GrepMode::PlainText => None,
        GrepMode::Fuzzy => {
            // Fuzzy mode: select candidate files, then delegate and return.
            let (mut files_to_search, mut filtered_file_count) =
                prepare_files_to_search(files, constraints_from_query, options, arena);

            // If FilePath constraints filtered everything out, retry without
            // them — the path-looking token was probably part of the pattern.
            if files_to_search.is_empty()
                && let Some(stripped) = strip_file_path_constraints(constraints_from_query)
            {
                let (retry_files, retry_count) =
                    prepare_files_to_search(files, &stripped, options, arena);
                files_to_search = retry_files;
                filtered_file_count = retry_count;
            }

            if files_to_search.is_empty() {
                return GrepResult {
                    total_files,
                    filtered_file_count,
                    next_file_offset: 0,
                    ..Default::default()
                };
            }

            // Bigram prefilter: pick 5 evenly-spaced probe bigrams, require
            // (5 - max_typos) of them to appear. Widely-spaced probes are
            // far more selective than sliding windows of adjacent bigrams.
            if let Some(idx) = bigram_index
                && idx.is_ready()
            {
                let bq = fuzzy_to_bigram_query(&grep_text, 7);
                if !bq.is_any()
                    && let Some(mut candidates) = bq.evaluate(idx)
                {
                    if let Some(overlay) = bigram_overlay {
                        // Clear files deleted since the index was built.
                        for (r, t) in candidates.iter_mut().zip(overlay.tombstones().iter()) {
                            *r &= !t;
                        }
                        // Fuzzy: conservatively add all modified files
                        for file_idx in overlay.modified_indices() {
                            let word = file_idx / 64;
                            if word < candidates.len() {
                                candidates[word] |= 1u64 << (file_idx % 64);
                            }
                        }
                    }

                    let base_ptr = files.as_ptr();
                    files_to_search.retain(|f| {
                        // Overflow files postdate the bigram index — keep them.
                        if f.is_overflow() {
                            return true;
                        }

                        // SAFETY: `f` borrows from `files`, so the pointer
                        // offset from the slice base recovers its index
                        // without any extra bookkeeping.
                        let file_idx =
                            unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };

                        BigramFilter::is_candidate(&candidates, file_idx)
                    });
                }
            }

            return fuzzy_grep_search(
                &grep_text,
                &files_to_search,
                options,
                total_files,
                filtered_file_count,
                case_insensitive,
                budget,
                abort_signal,
                base_path,
                arena,
                overflow_arena,
            );
        }
        // Compilation failure degrades to plain text (`regex` stays None)
        // and surfaces the error so the UI can explain the fallback.
        GrepMode::Regex => build_regex(&grep_text, options.smart_case)
            .inspect_err(|err| {
                tracing::warn!("Regex compilation failed for {}. Error {}", grep_text, err);

                regex_fallback_error = Some(err.to_string());
            })
            .ok(),
    };

    // An unescaped `\n` escape in the pattern enables multi-line search.
    let is_multiline = has_unescaped_newline_escape(&grep_text);

    let effective_pattern = if is_multiline {
        replace_unescaped_newline_escapes(&grep_text)
    } else {
        grep_text.to_string()
    };

    // Build the finder pattern once — used by PlainTextSink (and as a
    // literal-needle fallback anchor when regex compilation fell back to plain).
    let finder_pattern: Vec<u8> = if case_insensitive {
        effective_pattern.as_bytes().to_ascii_lowercase()
    } else {
        effective_pattern.as_bytes().to_vec()
    };
    let finder = memchr::memmem::Finder::new(&finder_pattern);
    let pattern_len = finder_pattern.len() as u32;

    // Bigram prefiltering: query the inverted index + merge overlay.
    // For PlainText mode: extract bigrams directly from the literal pattern.
    // For Regex mode: decompose the regex HIR into an AND/OR bigram query tree
    // and evaluate it against the inverted index (supports alternation, optional
    // groups, character classes, and sparse-1 bigrams across single-byte wildcards).
    let bigram_candidates = if let Some(idx) = bigram_index
        && idx.is_ready()
    {
        let raw_candidates = if regex.is_none() {
            // PlainText or regex-fallback-to-plain: literal bigram query
            idx.query(effective_pattern.as_bytes())
        } else {
            // Regex mode: decompose pattern into bigram query tree
            let bq = regex_to_bigram_query(&effective_pattern);
            if !bq.is_any() { bq.evaluate(idx) } else { None }
        };

        if let Some(mut candidates) = raw_candidates {
            if let Some(overlay) = bigram_overlay {
                // Clear tombstoned (deleted) files from candidates
                for (r, t) in candidates.iter_mut().zip(overlay.tombstones().iter()) {
                    *r &= !t;
                }

                if regex.is_none() {
                    // Literal pattern: only re-add modified files that still
                    // contain the pattern's bigrams per the overlay.
                    let pattern_bigrams = extract_bigrams(effective_pattern.as_bytes());
                    for file_idx in overlay.query_modified(&pattern_bigrams) {
                        let word = file_idx / 64;
                        if word < candidates.len() {
                            candidates[word] |= 1u64 << (file_idx % 64);
                        }
                    }
                } else {
                    // Regex: conservatively re-add every modified file.
                    for file_idx in overlay.modified_indices() {
                        let word = file_idx / 64;
                        if word < candidates.len() {
                            candidates[word] |= 1u64 << (file_idx % 64);
                        }
                    }
                }
            }
            Some(candidates)
        } else {
            None
        }
    } else {
        None
    };

    // Overflow files (added after the bigram index was built) are not in
    // the candidate bitset. They're few by definition, so just search all
    // of them directly via memchr — no bigram tracking needed.
    let overflow_start = bigram_overlay
        .map(|o| o.base_file_count())
        .unwrap_or(files.len());

    // It is important that this filtering step happens as early as possible.
    let (files_to_search, filtered_file_count) = match bigram_candidates {
        Some(ref candidates) if constraints_from_query.is_empty() => {
            // Fast path: no constraints, so the file list can be built
            // straight off the candidate bitset.
            // this call is essentially free and much more efficient than allowing a recollection
            let overflow_count = files.len().saturating_sub(overflow_start);
            let cap = BigramFilter::count_candidates(candidates) + overflow_count;
            let mut result: Vec<&FileItem> = Vec::with_capacity(cap);

            // Walk the set bits word by word; `bits &= bits - 1` clears the
            // lowest set bit on each iteration.
            for (word_idx, &word) in candidates.iter().enumerate() {
                if word == 0 {
                    continue;
                }
                let base = word_idx * 64;
                let mut bits = word;
                while bits != 0 {
                    let bit = bits.trailing_zeros() as usize;
                    let file_idx = base + bit;
                    // Stop at the overflow boundary: the loop below walks
                    // every overflow file, so counting them here too would duplicate.
                    if file_idx < overflow_start {
                        // SAFETY: file_idx < overflow_start <= files.len(),
                        // so the unchecked index is in bounds.
                        let f = unsafe { files.get_unchecked(file_idx) };
                        // NOTE(review): this branch doesn't check
                        // `is_deleted()` while the overflow loop below does —
                        // presumably the overlay tombstones already removed
                        // deleted indexed files; confirm.
                        if !f.is_binary() && f.size <= options.max_file_size {
                            result.push(f);
                        }
                    }
                    bits &= bits - 1;
                }
            }

            // Append all overflow files — they're not in the bigram index
            // so we search them unconditionally (typically few files).
            for f in &files[overflow_start..] {
                if !f.is_binary() && !f.is_deleted() && f.size <= options.max_file_size {
                    result.push(f);
                }
            }

            let total_searchable = files.len();
            // Skip sorting entirely when every file still has default
            // frecency/mtime — common on a cold index.
            let needs_sort = result
                .iter()
                .any(|f| f.total_frecency_score() != 0 || f.modified != 0);

            if needs_sort {
                // Frecency descending, ties broken by most recent mtime.
                sort_with_buffer(&mut result, |a, b| {
                    b.total_frecency_score()
                        .cmp(&a.total_frecency_score())
                        .then(b.modified.cmp(&a.modified))
                });
            }

            // Pagination: drop everything before `file_offset`.
            if options.file_offset > 0 && options.file_offset < result.len() {
                let paginated = result.split_off(options.file_offset);
                (paginated, total_searchable)
            } else if options.file_offset >= result.len() {
                (Vec::new(), total_searchable)
            } else {
                (result, total_searchable)
            }
        }
        _ => {
            // Constraint path (or no bigram index): run the general
            // constraint filter, then intersect with candidates if present.
            let (mut fts, mut fc) =
                prepare_files_to_search(files, constraints_from_query, options, arena);

            // Same FilePath-constraint retry as in the fuzzy branch above.
            if fts.is_empty()
                && let Some(stripped) = strip_file_path_constraints(constraints_from_query)
            {
                let (retry_files, retry_count) =
                    prepare_files_to_search(files, &stripped, options, arena);
                fts = retry_files;
                fc = retry_count;
            }

            if let Some(ref candidates) = bigram_candidates {
                let base_ptr = files.as_ptr();
                fts.retain(|f| {
                    // Overflow files postdate the bigram index — keep them.
                    if f.is_overflow() {
                        return true;
                    }

                    // we use ptr offsets to avoid additional allocations and keep the index
                    let file_idx =
                        unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize };
                    BigramFilter::is_candidate(candidates, file_idx)
                });
            }

            (fts, fc)
        }
    };

    if files_to_search.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count,
            next_file_offset: 0,
            ..Default::default()
        };
    }

    // `PlainTextMatcher` is used by the grep-searcher engine for line detection.
    // `PlainTextSink` / `RegexSink` handle highlight extraction independently.
    let plain_matcher = PlainTextMatcher {
        needle: &finder_pattern,
        case_insensitive,
    };

    let searcher = {
        let mut b = SearcherBuilder::new();
        b.line_number(true).multi_line(is_multiline);
        b
    }
    .build();

    // The memmem prefilter is only sound for literal needles; a live regex
    // must see every candidate file.
    let should_prefilter = regex.is_none();
    let mut result = perform_grep(
        &files_to_search,
        options,
        &GrepContext {
            total_files,
            filtered_file_count,
            budget,
            base_path,
            arena,
            overflow_arena,
            prefilter: should_prefilter.then_some(&finder),
            prefilter_case_insensitive: case_insensitive,
            abort_signal,
        },
        // Per-file closure: run the engine over `file_bytes`, collecting up
        // to `max_matches` entries through the mode-appropriate sink.
        |file_bytes: &[u8], max_matches: usize| {
            let state = SinkState {
                file_index: 0,
                matches: Vec::with_capacity(4),
                max_matches,
                before_context: options.before_context,
                after_context: options.after_context,
                classify_definitions: options.classify_definitions,
            };

            match regex {
                Some(ref re) => {
                    let regex_matcher = RegexMatcher {
                        regex: re,
                        is_multiline,
                    };
                    let mut sink = RegexSink { state, re };
                    if let Err(e) = searcher.search_slice(&regex_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (regex) search failed");
                    }
                    sink.state.matches
                }
                None => {
                    let mut sink = PlainTextSink {
                        state,
                        finder: &finder,
                        pattern_len,
                        case_insensitive,
                    };
                    if let Err(e) = searcher.search_slice(&plain_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (plain text) search failed");
                    }
                    sink.state.matches
                }
            }
        },
    );
    result.regex_fallback_error = regex_fallback_error;
    result
}
2223
2224pub fn parse_grep_query(query: &str) -> FFFQuery<'_> {
2225    let parser = QueryParser::new(GrepConfig);
2226    parser.parse(query)
2227}
2228
2229fn strip_file_path_constraints<'a>(
2230    constraints: &[Constraint<'a>],
2231) -> Option<fff_query_parser::ConstraintVec<'a>> {
2232    if !constraints
2233        .iter()
2234        .any(|c| matches!(c, Constraint::FilePath(_)))
2235    {
2236        return None;
2237    }
2238
2239    let filtered: fff_query_parser::ConstraintVec<'a> = constraints
2240        .iter()
2241        .filter(|c| !matches!(c, Constraint::FilePath(_)))
2242        .cloned()
2243        .collect();
2244
2245    Some(filtered)
2246}
2247
2248#[cfg(test)]
2249mod tests {
2250    use super::*;
2251
2252    #[test]
2253    fn test_unescaped_newline_detection() {
2254        // Single \n → multiline
2255        assert!(has_unescaped_newline_escape("foo\\nbar"));
2256        // \\n → escaped backslash + literal n, NOT multiline
2257        // (this is what the user types when grepping Rust source with `\\nvim`)
2258        assert!(!has_unescaped_newline_escape("foo\\\\nvim-data"));
2259        // Real-world: source file has literal \\AppData\\Local\\nvim-data
2260        // (double backslash in the file, so user types double backslash)
2261        assert!(!has_unescaped_newline_escape(
2262            r#"format!("{}\\AppData\\Local\\nvim-data","#
2263        ));
2264        // No \n at all
2265        assert!(!has_unescaped_newline_escape("hello world"));
2266        // \\\\n → even number of backslashes before n → NOT multiline
2267        assert!(!has_unescaped_newline_escape("foo\\\\\\\\nbar"));
2268        // \\\n → 3 backslashes: first two pair up, third + n = \n → multiline
2269        assert!(has_unescaped_newline_escape("foo\\\\\\nbar"));
2270    }
2271
2272    #[test]
2273    fn test_replace_unescaped_newline() {
2274        // \n → real newline
2275        assert_eq!(replace_unescaped_newline_escapes("foo\\nbar"), "foo\nbar");
2276        // \\n → preserved as-is
2277        assert_eq!(
2278            replace_unescaped_newline_escapes("foo\\\\nvim"),
2279            "foo\\\\nvim"
2280        );
2281    }
2282
2283    #[test]
2284    fn test_fuzzy_typo_scoring() {
2285        // Mirror the config from fuzzy_grep_search
2286        let needle = "schema";
2287        let max_typos = (needle.len() / 3).min(2); // 2
2288        let config = neo_frizbee::Config {
2289            max_typos: Some(max_typos as u16),
2290            sort: false,
2291            scoring: neo_frizbee::Scoring {
2292                exact_match_bonus: 100,
2293                ..neo_frizbee::Scoring::default()
2294            },
2295        };
2296        let min_matched = needle.len().saturating_sub(1).max(1); // 5
2297        let max_match_span = needle.len() + 4; // 10
2298
2299        // Helper: check if a match would pass our post-filters
2300        let passes = |n: &str, h: &str| -> bool {
2301            let Some(mut mi) = neo_frizbee::match_list_indices(n, &[h], &config)
2302                .into_iter()
2303                .next()
2304            else {
2305                return false;
2306            };
2307            // upstream returns indices in reverse order, sort ascending
2308            mi.indices.sort_unstable();
2309            if mi.indices.len() < min_matched {
2310                return false;
2311            }
2312            if let (Some(&first), Some(&last)) = (mi.indices.first(), mi.indices.last()) {
2313                let span = last - first + 1;
2314                if span > max_match_span {
2315                    return false;
2316                }
2317                let density = (mi.indices.len() * 100) / span;
2318                if density < 70 {
2319                    return false;
2320                }
2321            }
2322            true
2323        };
2324
2325        // Exact match: must pass
2326        assert!(passes("schema", "schema"));
2327        // Exact in longer line: must pass
2328        assert!(passes("schema", "  schema: String,"));
2329        // In identifier: must pass
2330        assert!(passes("schema", "pub fn validate_schema() {}"));
2331        // Transposition: must pass
2332        assert!(passes("shcema", "schema"));
2333        // Partial "ema" only line: must NOT pass
2334        assert!(!passes("schema", "it has ema in it"));
2335        // Completely unrelated: must NOT pass
2336        assert!(!passes("schema", "hello world foo bar"));
2337    }
2338
2339    #[test]
2340    fn test_multi_grep_search() {
2341        use crate::file_picker::{FilePicker, FilePickerOptions};
2342        use std::io::Write;
2343
2344        let dir = tempfile::tempdir().unwrap();
2345
2346        // File 1: has "GrepMode" and "GrepMatch"
2347        {
2348            let mut f = std::fs::File::create(dir.path().join("grep.rs")).unwrap();
2349            writeln!(f, "pub enum GrepMode {{").unwrap();
2350            writeln!(f, "    PlainText,").unwrap();
2351            writeln!(f, "    Regex,").unwrap();
2352            writeln!(f, "}}").unwrap();
2353            writeln!(f, "pub struct GrepMatch {{").unwrap();
2354            writeln!(f, "    pub line_number: u64,").unwrap();
2355            writeln!(f, "}}").unwrap();
2356        }
2357
2358        // File 2: has "PlainTextMatcher" only
2359        {
2360            let mut f = std::fs::File::create(dir.path().join("matcher.rs")).unwrap();
2361            writeln!(f, "struct PlainTextMatcher {{").unwrap();
2362            writeln!(f, "    needle: Vec<u8>,").unwrap();
2363            writeln!(f, "}}").unwrap();
2364        }
2365
2366        // File 3: no matches
2367        {
2368            let mut f = std::fs::File::create(dir.path().join("other.rs")).unwrap();
2369            writeln!(f, "fn main() {{").unwrap();
2370            writeln!(f, "    println!(\"hello\");").unwrap();
2371            writeln!(f, "}}").unwrap();
2372        }
2373
2374        let mut picker = FilePicker::new(FilePickerOptions {
2375            base_path: dir.path().to_str().unwrap().into(),
2376            watch: false,
2377            ..Default::default()
2378        })
2379        .unwrap();
2380        picker.collect_files().unwrap();
2381
2382        let files = picker.get_files();
2383        let arena = picker.arena_base_ptr();
2384
2385        let options = super::GrepSearchOptions {
2386            max_file_size: 10 * 1024 * 1024,
2387            max_matches_per_file: 0,
2388            smart_case: true,
2389            file_offset: 0,
2390            page_limit: 100,
2391            mode: super::GrepMode::PlainText,
2392            time_budget_ms: 0,
2393            before_context: 0,
2394            after_context: 0,
2395            classify_definitions: false,
2396            trim_whitespace: false,
2397            abort_signal: None,
2398        };
2399        let no_cancel = AtomicBool::new(false);
2400
2401        // Test with 3 patterns
2402        let result = super::multi_grep_search(
2403            files,
2404            &["GrepMode", "GrepMatch", "PlainTextMatcher"],
2405            &[],
2406            &options,
2407            picker.cache_budget(),
2408            None,
2409            None,
2410            &no_cancel,
2411            dir.path(),
2412            arena,
2413            arena,
2414        );
2415
2416        assert!(
2417            result.matches.len() >= 3,
2418            "Expected at least 3 matches, got {}",
2419            result.matches.len()
2420        );
2421
2422        let has_grep_mode = result
2423            .matches
2424            .iter()
2425            .any(|m| m.line_content.contains("GrepMode"));
2426        let has_grep_match = result
2427            .matches
2428            .iter()
2429            .any(|m| m.line_content.contains("GrepMatch"));
2430        let has_plain_text_matcher = result
2431            .matches
2432            .iter()
2433            .any(|m| m.line_content.contains("PlainTextMatcher"));
2434
2435        assert!(has_grep_mode, "Should find GrepMode");
2436        assert!(has_grep_match, "Should find GrepMatch");
2437        assert!(has_plain_text_matcher, "Should find PlainTextMatcher");
2438
2439        assert_eq!(result.files.len(), 2, "Should match exactly 2 files");
2440
2441        // Test with single pattern
2442        let result2 = super::multi_grep_search(
2443            files,
2444            &["PlainTextMatcher"],
2445            &[],
2446            &options,
2447            picker.cache_budget(),
2448            None,
2449            None,
2450            &no_cancel,
2451            dir.path(),
2452            arena,
2453            arena,
2454        );
2455        assert_eq!(
2456            result2.matches.len(),
2457            1,
2458            "Single pattern should find 1 match"
2459        );
2460
2461        // Test with empty patterns
2462        let result3 = super::multi_grep_search(
2463            files,
2464            &[],
2465            &[],
2466            &options,
2467            picker.cache_budget(),
2468            None,
2469            None,
2470            &no_cancel,
2471            dir.path(),
2472            arena,
2473            arena,
2474        );
2475        assert_eq!(
2476            result3.matches.len(),
2477            0,
2478            "Empty patterns should find nothing"
2479        );
2480    }
2481
    /// Regression test for issue #407: Live grep returns duplicate results
    /// when the bigram candidate bitset has trailing bits set beyond
    /// `base_file_count`. The bitset is rounded up to a multiple of 64 bits
    /// so any trailing bit that happens to be set (e.g. from overlay data)
    /// would previously map to an overflow file index, which was then also
    /// unconditionally appended by the overflow loop, producing duplicates.
    ///
    /// NOTE: the statement order below is load-bearing — the bigram index
    /// must be built over the base files BEFORE the overflow files are
    /// created, and the overlay bit is injected AFTER, to reproduce the
    /// exact pre-fix duplicate scenario.
    #[test]
    fn test_grep_no_duplicates_with_overflow_trailing_bits() {
        use crate::bigram_filter::{BigramIndexBuilder, BigramOverlay};
        use crate::file_picker::{FilePicker, FilePickerOptions};
        use std::io::Write;
        use std::sync::atomic::AtomicBool;

        let dir = tempfile::tempdir().unwrap();

        // Five base files: only three contain the pattern "unicorn".
        // We need some files WITHOUT the pattern so the bigrams for
        // "unicorn" aren't treated as ubiquitous (≥90% of files) and
        // dropped from the index during compress().
        let base_contents: &[(&str, &str)] = &[
            ("a.txt", "hello unicorn world"),
            ("b.txt", "another unicorn line"),
            ("c.txt", "one more unicorn here"),
            ("d.txt", "nothing special in here"),
            ("e.txt", "just some random content"),
        ];
        for (name, content) in base_contents {
            let mut f = std::fs::File::create(dir.path().join(name)).unwrap();
            writeln!(f, "{}", content).unwrap();
        }

        let mut picker = FilePicker::new(FilePickerOptions {
            base_path: dir.path().to_str().unwrap().into(),
            watch: false,
            ..Default::default()
        })
        .unwrap();
        picker.collect_files().unwrap();
        assert_eq!(picker.get_files().len(), 5);

        // Manually build a bigram index over the 5 base files.
        // Both a consecutive-bigram and a skip-bigram index are built, then
        // the skip index is attached to the consecutive one.
        let base_count = 5usize;
        let consec_builder = BigramIndexBuilder::new(base_count);
        let skip_builder = BigramIndexBuilder::new(base_count);
        for (i, (_, content)) in base_contents.iter().enumerate() {
            consec_builder.add_file_content(&skip_builder, i, content.as_bytes());
        }
        // NOTE(review): `Some(0)` is assumed to be a threshold override that
        // keeps postings during compression — confirm against compress() docs.
        let mut index = consec_builder.compress(Some(0));
        index.set_skip_index(skip_builder.compress(Some(0)));
        picker.set_bigram_index(index, BigramOverlay::new(base_count));

        // Add three overflow files (new after the bigram index was built),
        // all containing "unicorn".
        for name in ["f.txt", "g.txt", "h.txt"] {
            let path = dir.path().join(name);
            let mut f = std::fs::File::create(&path).unwrap();
            writeln!(f, "overflow unicorn entry").unwrap();
            // Close the handle so the content is flushed before the picker
            // is notified and (potentially) reads the file.
            drop(f);
            picker.on_create_or_modify(&path);
        }
        assert_eq!(picker.get_files().len(), 8);

        // Inject a trailing bit into the overlay at a file index that
        // corresponds to an overflow file (i.e. >= base_file_count=5 but
        // < bitset_word_size=64). Without the fix, the bigram-candidate
        // merge would set this bit in the bitset, and the bitset loop would
        // push files[6] while the overflow loop also appends files[5..]
        // which includes files[6], producing a duplicate.
        let overflow_rel = "g.txt"; // middle overflow file
        let overflow_abs = picker
            .get_files()
            .iter()
            .position(|f| f.relative_path(&picker) == overflow_rel)
            .expect("overflow file should be present");
        assert!(overflow_abs >= base_count);
        assert!(
            overflow_abs < 64,
            "index must fit in the single bitset word"
        );

        if let Some(overlay) = picker.bigram_overlay() {
            overlay
                .write()
                .modify_file(overflow_abs, b"overflow unicorn entry");
        }

        // Run a grep for "unicorn": six files match
        // (a, b, c in base + f, g, h in overflow).
        let query = super::parse_grep_query("unicorn");
        let options = super::GrepSearchOptions {
            max_file_size: 10 * 1024 * 1024,
            max_matches_per_file: 0,
            smart_case: true,
            file_offset: 0,
            page_limit: 100,
            mode: super::GrepMode::PlainText,
            time_budget_ms: 0,
            before_context: 0,
            after_context: 0,
            classify_definitions: false,
            trim_whitespace: false,
            abort_signal: Some(std::sync::Arc::new(AtomicBool::new(false))),
        };
        let result = picker.grep(&query, &options);

        // Collect the matched relative paths via the returned files list.
        let mut paths: Vec<String> = result
            .files
            .iter()
            .map(|f| f.relative_path(&picker))
            .collect();
        paths.sort();

        // Every file (base + overflow) should match exactly once.
        // (paths is sorted, so dedup() removes any duplicate neighbors;
        // equality with the original proves there were none.)
        let mut dedup = paths.clone();
        dedup.dedup();
        assert_eq!(
            dedup, paths,
            "grep must not return duplicate results (issue #407): {:?}",
            paths
        );
        assert_eq!(
            paths,
            vec!["a.txt", "b.txt", "c.txt", "f.txt", "g.txt", "h.txt"],
        );

        // And the match count must equal the number of files (one line per
        // file). A duplicate entry in files_to_search would double-count
        // matches for the duplicated file.
        assert_eq!(
            result.matches.len(),
            6,
            "expected exactly one match per file, got {}",
            result.matches.len()
        );
    }
2618}