// fff_search/grep.rs
//! High-performance grep engine for live content search.
//!
//! Searches file contents using the `grep-searcher` crate with mmap-backed
//! file access. Files are searched in frecency order for optimal pagination
//! performance — the most relevant files are searched first, enabling early
//! termination once enough results are collected.
7
8use crate::constraints::apply_constraints;
9use crate::sort_buffer::sort_with_buffer;
10use crate::types::FileItem;
11use aho_corasick::AhoCorasick;
12use fff_grep::lines::{self, LineStep};
13use fff_grep::{Searcher, SearcherBuilder, Sink, SinkMatch};
14use fff_query_parser::{Constraint, FFFQuery, GrepConfig, QueryParser};
15use grep_matcher::{Match, Matcher, NoCaptures, NoError};
16use rayon::prelude::*;
17use smallvec::SmallVec;
18use std::sync::atomic::{AtomicBool, Ordering};
19use tracing::Level;
20
21/// Detect if a line looks like a code definition (struct, fn, class, etc.).
22///
23/// Used at match time to tag `GrepMatch::is_definition` so that output
24/// formatters can sort/annotate definitions without re-scanning lines.
25///
26/// Hand-rolled keyword scanner — avoids regex overhead entirely.
27/// Strips optional visibility/modifier keywords, then checks if the next
28/// token is a definition keyword followed by a word boundary.
29pub fn is_definition_line(line: &str) -> bool {
30    let s = line.trim_start().as_bytes();
31    let s = skip_modifiers(s);
32    is_definition_keyword(s)
33}
34
/// Modifier keywords that may precede a definition keyword.
/// `skip_modifiers` only consumes one when it is followed by whitespace,
/// so identifiers like `publish` are not mistaken for `pub`.
const MODIFIERS: &[&[u8]] = &[
    b"pub",
    b"export",
    b"default",
    b"async",
    b"abstract",
    b"unsafe",
    b"static",
    b"protected",
    b"private",
    b"public",
];
49
/// Definition keywords recognized by `is_definition_keyword`, covering Rust,
/// JS/TS, Python, Go, and similar languages. Matched only at a word
/// boundary, so e.g. `structure` does not count as `struct`.
const DEF_KEYWORDS: &[&[u8]] = &[
    b"struct",
    b"fn",
    b"enum",
    b"trait",
    b"impl",
    b"class",
    b"interface",
    b"function",
    b"def",
    b"func",
    b"type",
    b"module",
    b"object",
];
66
67/// Skip zero or more modifier keywords (including `pub(crate)` style visibility).
68fn skip_modifiers(mut s: &[u8]) -> &[u8] {
69    loop {
70        // Handle `pub(...)` — e.g. `pub(crate)`, `pub(super)`
71        if s.starts_with(b"pub(")
72            && let Some(end) = s.iter().position(|&b| b == b')')
73        {
74            s = skip_ws(&s[end + 1..]);
75            continue;
76        }
77        let mut matched = false;
78        for &kw in MODIFIERS {
79            if s.starts_with(kw) {
80                let rest = &s[kw.len()..];
81                if rest.first().is_some_and(|b| b.is_ascii_whitespace()) {
82                    s = skip_ws(rest);
83                    matched = true;
84                    break;
85                }
86            }
87        }
88        if !matched {
89            return s;
90        }
91    }
92}
93
94/// Check if `s` starts with a definition keyword followed by a word boundary.
95fn is_definition_keyword(s: &[u8]) -> bool {
96    for &kw in DEF_KEYWORDS {
97        if s.starts_with(kw) {
98            let after = s.get(kw.len());
99            // Word boundary: end of input, or next byte is not alphanumeric/underscore
100            if after.is_none_or(|b| !b.is_ascii_alphanumeric() && *b != b'_') {
101                return true;
102            }
103        }
104    }
105    false
106}
107
/// Skip leading ASCII whitespace, returning the remaining suffix.
#[inline]
fn skip_ws(s: &[u8]) -> &[u8] {
    match s.iter().position(|b| !b.is_ascii_whitespace()) {
        Some(first_non_ws) => &s[first_non_ws..],
        None => &[],
    }
}
117
118/// Detect import/use lines — lower value than definitions or usages.
119///
120/// Checks if the line (after leading whitespace) starts with a common
121/// import statement prefix. Pure byte-level checks, no regex.
122pub fn is_import_line(line: &str) -> bool {
123    let s = line.trim_start().as_bytes();
124    s.starts_with(b"import ")
125        || s.starts_with(b"import\t")
126        || (s.starts_with(b"from ") && s.get(5).is_some_and(|&b| b == b'\'' || b == b'"'))
127        || s.starts_with(b"use ")
128        || s.starts_with(b"use\t")
129        || starts_with_require(s)
130        || starts_with_include(s)
131}
132
/// Match `require(` or `require (` at the start of `s`.
#[inline]
fn starts_with_require(s: &[u8]) -> bool {
    matches!(
        s.strip_prefix(b"require"),
        Some([b'(', ..]) | Some([b' ', b'(', ..])
    )
}
142
143/// Match `# include ` (with optional spaces after `#`).
144#[inline]
145fn starts_with_include(s: &[u8]) -> bool {
146    if s.first() != Some(&b'#') {
147        return false;
148    }
149    let rest = skip_ws(&s[1..]);
150    rest.starts_with(b"include ") || rest.starts_with(b"include\t")
151}
152
/// Determine whether `text` contains any regex metacharacters.
///
/// Uses `regex::escape` from the regex crate as the source of truth — if the
/// escaped form differs from the original, the text contains at least one
/// character that would be interpreted as regex syntax. This is
/// deterministic and always in sync with the regex engine (no hand-rolled
/// metacharacter list to maintain).
///
/// Callers can use this to choose between `GrepMode::Regex` and
/// `GrepMode::PlainText`. When `Regex` mode is chosen and the pattern turns
/// out to be invalid, `grep_search` already falls back to plain-text matching
/// and populates `GrepResult::regex_fallback_error`.
pub fn has_regex_metacharacters(text: &str) -> bool {
    regex::escape(text) != text
}
167
/// Check if `text` contains `\n` that is NOT preceded by another `\`.
///
/// `\n` → true (user wants multiline search)
/// `\\n` → false (escaped backslash followed by literal `n`, e.g. `\\nvim-data`)
#[inline]
fn has_unescaped_newline_escape(text: &str) -> bool {
    let bytes = text.as_bytes();
    let last = bytes.len().saturating_sub(1);
    let mut i = 0;
    while i < last {
        if bytes[i] != b'\\' {
            i += 1;
            continue;
        }
        if bytes[i + 1] == b'n' {
            // Length of the backslash run ending at `i`: an odd run means the
            // `n` is really escaped (`\n`); an even run means the backslashes
            // pair up and the `n` is a literal character.
            let run = bytes[..=i].iter().rev().take_while(|&&b| b == b'\\').count();
            if run % 2 == 1 {
                return true;
            }
        }
        // A backslash always consumes the following byte as its escapee.
        i += 2;
    }
    false
}
197
/// Replace only unescaped `\n` sequences with real newlines.
///
/// `\n` → newline character
/// `\\n` → preserved as-is (literal backslash + `n`)
fn replace_unescaped_newline_escapes(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        // A `\` followed by `n` counts as a newline escape only when the run
        // of consecutive backslashes ending here has odd length (the last
        // backslash is unpaired and escapes the `n`).
        let is_newline_escape = bytes[i] == b'\\'
            && bytes.get(i + 1) == Some(&b'n')
            && bytes[..=i].iter().rev().take_while(|&&b| b == b'\\').count() % 2 == 1;
        if is_newline_escape {
            out.push(b'\n');
            i += 2;
        } else {
            out.push(bytes[i]);
            i += 1;
        }
    }
    // Input was valid UTF-8 and we only substitute ASCII pairs, so this
    // conversion should never fail; fall back to the original defensively.
    String::from_utf8(out).unwrap_or_else(|_| text.to_string())
}
228
/// Controls how the grep pattern is interpreted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum GrepMode {
    /// Default mode: the query is treated as literal text.
    /// The pattern is searched using SIMD-accelerated `memchr::memmem`.
    /// Special regex characters in the query have no special meaning.
    #[default]
    PlainText,
    /// Regex mode: the query is treated as a regular expression, using the
    /// same `grep-matcher` / `regex::bytes::Regex` engine. If the pattern
    /// fails to compile, the search falls back to plain-text matching and
    /// reports the error via `GrepResult::regex_fallback_error`.
    Regex,
    /// Fuzzy mode: the query is treated as a fuzzy needle matched against
    /// each line using neo_frizbee's Smith-Waterman scoring. Lines are ranked
    /// by match score. Individual matched character positions are reported
    /// as highlight ranges.
    Fuzzy,
}
247
/// A single content match within a file.
#[derive(Debug, Clone)]
pub struct GrepMatch {
    /// Index into the deduplicated `files` vec of the owning `GrepResult`.
    pub file_index: usize,
    /// 1-based line number (0 when the searcher did not count lines).
    pub line_number: u64,
    /// 0-based byte column of the first match start within the line.
    pub col: usize,
    /// Absolute byte offset of the matched line from the start of the file.
    /// Can be used by the preview to seek directly without scanning from the top.
    pub byte_offset: u64,
    /// The matched line text, truncated to `MAX_LINE_DISPLAY_LEN` bytes on a
    /// UTF-8 char boundary.
    pub line_content: String,
    /// Byte offsets `(start, end)` within `line_content` for each match.
    /// Stack-allocated for the common case of ≤4 spans per line.
    pub match_byte_offsets: SmallVec<[(u32, u32); 4]>,
    /// Fuzzy match score from neo_frizbee (only set in `GrepMode::Fuzzy`).
    pub fuzzy_score: Option<u16>,
    /// Whether the matched line looks like a definition (struct, fn, class, etc.).
    /// Computed at match time so output formatters don't need to re-scan.
    pub is_definition: bool,
    /// Lines before the match (for context display). Empty when context is 0.
    pub context_before: Vec<String>,
    /// Lines after the match (for context display). Empty when context is 0.
    pub context_after: Vec<String>,
}
275
/// Result of a grep search.
#[derive(Debug, Clone, Default)]
pub struct GrepResult<'a> {
    /// All collected matches; each points into `files` via `file_index`.
    pub matches: Vec<GrepMatch>,
    /// Deduplicated file references for the returned matches.
    pub files: Vec<&'a FileItem>,
    /// Number of files actually searched in this call.
    pub total_files_searched: usize,
    /// Total number of indexed files (before filtering).
    pub total_files: usize,
    /// Total number of searchable files (after filtering out binary, too-large, etc.).
    pub filtered_file_count: usize,
    /// The file offset to pass for the next page. `0` if there are no more files.
    /// Callers should store this and pass it as `file_offset` in the next call.
    pub next_file_offset: usize,
    /// When regex mode fails to compile the pattern, the search falls back to
    /// literal matching and this field contains the compilation error message.
    /// The UI can display this to inform the user their regex was invalid.
    pub regex_fallback_error: Option<String>,
}
296
/// Options for grep search.
#[derive(Debug, Clone)]
pub struct GrepSearchOptions {
    /// Upper bound on file size eligible for searching. (Presumably bytes;
    /// enforced in the file-filtering step outside this block — confirm there.)
    pub max_file_size: u64,
    /// Per-file cap on collected matches. 0 appears to mean "no cap",
    /// matching the sinks' `max_matches != 0` guard.
    pub max_matches_per_file: usize,
    /// Smart-case toggle — conventionally "case-insensitive unless the
    /// pattern contains an uppercase letter"; applied where the matcher is
    /// built (not shown in this block — verify at the call site).
    pub smart_case: bool,
    /// File-based pagination offset: index into the sorted/filtered file list
    /// to start searching from. Pass 0 for the first page, then use
    /// `GrepResult::next_file_offset` for subsequent pages.
    pub file_offset: usize,
    /// Maximum number of matches to collect before stopping.
    pub page_limit: usize,
    /// How to interpret the search pattern. Defaults to `PlainText`.
    pub mode: GrepMode,
    /// Maximum time in milliseconds to spend searching before returning partial
    /// results. Prevents UI freezes on pathological queries. 0 = no limit.
    pub time_budget_ms: u64,
    /// Number of context lines to include before each match. 0 = disabled.
    pub before_context: usize,
    /// Number of context lines to include after each match. 0 = disabled.
    pub after_context: usize,
    /// Whether to classify each match as a definition line. Adds ~2% overhead
    /// on large repos; disable for interactive grep where it is not needed.
    pub classify_definitions: bool,
}
322
323/// Lightweight wrapper around `regex::bytes::Regex` implementing the
324/// `grep_matcher::Matcher` trait required by `grep-searcher`.
325///
326/// When `is_multiline` is false (the common case), we report `\n` as the
327/// line terminator. This enables the **fast** search path in `fff-searcher`:
328/// instead of calling `shortest_match()` on every single line (slow path),
329/// the searcher calls `find_candidate_line()` once on the entire remaining
330/// buffer, letting the regex DFA skip non-matching content in a single pass.
331///
332/// For multiline patterns we must NOT report a line terminator — the regex
333/// can match across line boundaries, so the searcher needs the `MultiLine`
334/// strategy.
335struct RegexMatcher<'r> {
336    regex: &'r regex::bytes::Regex,
337    is_multiline: bool,
338}
339
340impl Matcher for RegexMatcher<'_> {
341    type Captures = NoCaptures;
342    type Error = NoError;
343
344    #[inline]
345    fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
346        Ok(self
347            .regex
348            .find_at(haystack, at)
349            .map(|m| Match::new(m.start(), m.end())))
350    }
351
352    #[inline]
353    fn new_captures(&self) -> Result<NoCaptures, NoError> {
354        Ok(NoCaptures::new())
355    }
356
357    #[inline]
358    fn line_terminator(&self) -> Option<grep_matcher::LineTerminator> {
359        if self.is_multiline {
360            None
361        } else {
362            Some(grep_matcher::LineTerminator::byte(b'\n'))
363        }
364    }
365}
366
367/// A `grep_matcher::Matcher` backed by `memchr::memmem` for literal search.
368///
369/// This is used in `PlainText` mode and is significantly faster than regex
370/// for literal patterns: memchr uses SIMD (AVX2/NEON) two-way substring
371/// search internally, avoiding the overhead of regex compilation and DFA
372/// state transitions.
373///
374/// Always reports `\n` as line terminator so the searcher uses the fast
375/// candidate-line path (plain text can never span lines unless `\n` is
376/// literally in the needle, which we handle separately).
377struct PlainTextMatcher<'a> {
378    /// Case-folded needle bytes for case-insensitive matching.
379    /// When case-sensitive, this is the original pattern bytes.
380    needle: &'a [u8],
381    case_insensitive: bool,
382}
383
384impl Matcher for PlainTextMatcher<'_> {
385    type Captures = NoCaptures;
386    type Error = NoError;
387
388    #[inline]
389    fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>, NoError> {
390        let hay = &haystack[at..];
391
392        let found = if self.case_insensitive {
393            // ASCII case-insensitive: lowercase the haystack slice on the fly.
394            // We scan with a rolling window to avoid allocating a full copy.
395            ascii_case_insensitive_find(hay, self.needle)
396        } else {
397            memchr::memmem::find(hay, self.needle)
398        };
399
400        Ok(found.map(|pos| Match::new(at + pos, at + pos + self.needle.len())))
401    }
402
403    #[inline]
404    fn new_captures(&self) -> Result<NoCaptures, NoError> {
405        Ok(NoCaptures::new())
406    }
407
408    #[inline]
409    fn line_terminator(&self) -> Option<grep_matcher::LineTerminator> {
410        Some(grep_matcher::LineTerminator::byte(b'\n'))
411    }
412}
413
/// ASCII case-insensitive substring search.
///
/// `needle_lower` is expected to be already ASCII-lowercased by the caller
/// (the scan derives the uppercase variant of the first byte itself, but
/// verifies the tail against the lowercased bytes as-is).
///
/// Uses a SIMD-accelerated two-case scan for the needle's first byte via
/// `memchr2_iter`, then verifies each candidate's remaining bytes with
/// `ascii_case_eq`. Returns the byte offset of the first match, if any.
#[inline]
fn ascii_case_insensitive_find(haystack: &[u8], needle_lower: &[u8]) -> Option<usize> {
    let nlen = needle_lower.len();
    // Empty needle matches at the start, mirroring `memmem::find`.
    if nlen == 0 {
        return Some(0);
    }

    if haystack.len() < nlen {
        return None;
    }

    let first_lo = needle_lower[0];
    let first_hi = first_lo.to_ascii_uppercase();

    // Single-byte needle: just find either case variant.
    if nlen == 1 {
        return memchr::memchr2(first_lo, first_hi, haystack);
    }

    let tail = &needle_lower[1..];
    // Last index at which a full-length match can still start.
    let end = haystack.len() - nlen;

    // Scan for candidates where the first byte matches (either case), then
    // verify the remaining bytes case-insensitively.
    for pos in memchr::memchr2_iter(first_lo, first_hi, &haystack[..=end]) {
        // SAFETY: `pos <= end == haystack.len() - nlen`, so
        // `pos + nlen <= haystack.len()` and the range is in bounds.
        let candidate = unsafe { haystack.get_unchecked(pos + 1..pos + nlen) };
        if ascii_case_eq(candidate, tail) {
            return Some(pos);
        }
    }
    None
}
454
/// Fast ASCII case-insensitive byte slice comparison.
///
/// Returns true iff `a` and `b` are byte-for-byte equal after ASCII case
/// folding: letters compare equal to their other-case variant, and every
/// other byte must match exactly. Both slices must have the same length
/// (checked with `debug_assert!`; in release builds comparison is clamped
/// to the shorter length rather than reading out of bounds).
///
/// The previous implementation folded whole 8-byte words with `| 0x20`,
/// which wrongly equated non-letter byte pairs differing by 0x20 (e.g.
/// `[` vs `{`, `@` vs `` ` ``, `\` vs `|`). The fold is now only a fast
/// accept for exactly-equal words; mismatching words fall back to the
/// standard library's letters-only fold.
#[inline]
fn ascii_case_eq(a: &[u8], b: &[u8]) -> bool {
    debug_assert_eq!(a.len(), b.len());
    // Clamp defensively so a length mismatch can never read out of bounds.
    let len = a.len().min(b.len());
    let (a, b) = (&a[..len], &b[..len]);

    // Fast path: compare 8 bytes at a time. Identical words (common for
    // non-alpha content and already-lowercase text) are accepted without
    // any folding; a mismatching word is re-checked byte-wise with the
    // correct letters-only case fold.
    let mut i = 0;
    while i + 8 <= len {
        // Slice length is exactly 8, so the conversion cannot fail.
        let wa = u64::from_ne_bytes(a[i..i + 8].try_into().unwrap());
        let wb = u64::from_ne_bytes(b[i..i + 8].try_into().unwrap());
        if wa != wb && !a[i..i + 8].eq_ignore_ascii_case(&b[i..i + 8]) {
            return false;
        }
        i += 8;
    }

    // Tail: std folds per byte and only for ASCII letters.
    a[i..].eq_ignore_ascii_case(&b[i..])
}
500
/// Maximum bytes of a matched line to keep for display (truncation lands on
/// a UTF-8 char boundary via `truncate_display_bytes`). Prevents minified
/// JS or huge single-line files from blowing up memory.
const MAX_LINE_DISPLAY_LEN: usize = 512;
504
/// Shared per-file state for the grep sinks below.
struct SinkState {
    /// Index into the caller's deduplicated file list; copied into each match.
    file_index: usize,
    /// Matches collected so far for this file.
    matches: Vec<GrepMatch>,
    /// Per-file match cap; 0 disables the cap (see the sinks' guard).
    max_matches: usize,
    /// Number of context lines to capture before each match (0 = none).
    before_context: usize,
    /// Number of context lines to capture after each match (0 = none).
    after_context: usize,
    /// When true, tag each match via `is_definition_line`.
    classify_definitions: bool,
}
513
impl SinkState {
    /// Normalize a raw matched line for display.
    ///
    /// Returns `(display_bytes, display_len, line_number, byte_offset)`: the
    /// line bytes with trailing `\n`/`\r` stripped and truncated to
    /// `MAX_LINE_DISPLAY_LEN` on a UTF-8 char boundary, the length of those
    /// bytes, the 1-based line number (0 when the searcher did not count
    /// lines), and the line's absolute byte offset within the file.
    #[inline]
    fn prepare_line<'a>(line_bytes: &'a [u8], mat: &SinkMatch<'_>) -> (&'a [u8], u32, u64, u64) {
        let line_number = mat.line_number().unwrap_or(0);
        let byte_offset = mat.absolute_byte_offset();

        // Trim trailing newline/CR directly on bytes to avoid UTF-8 conversion.
        let trimmed_len = {
            let mut len = line_bytes.len();
            while len > 0 && matches!(line_bytes[len - 1], b'\n' | b'\r') {
                len -= 1;
            }
            len
        };
        let trimmed_bytes = &line_bytes[..trimmed_len];

        // Truncate for display (floor to a char boundary).
        let display_bytes = truncate_display_bytes(trimmed_bytes);

        let display_len = display_bytes.len() as u32;
        (display_bytes, display_len, line_number, byte_offset)
    }

    /// Append a `GrepMatch` built from this file's state.
    ///
    /// Definition classification runs here (when enabled) so that every sink
    /// shares one code path and output formatters never re-scan the line.
    #[inline]
    #[allow(clippy::too_many_arguments)]
    fn push_match(
        &mut self,
        line_number: u64,
        col: usize,
        byte_offset: u64,
        line_content: String,
        match_byte_offsets: SmallVec<[(u32, u32); 4]>,
        context_before: Vec<String>,
        context_after: Vec<String>,
    ) {
        let is_definition = self.classify_definitions && is_definition_line(&line_content);
        self.matches.push(GrepMatch {
            file_index: self.file_index,
            line_number,
            col,
            byte_offset,
            line_content,
            match_byte_offsets,
            // The sinks in this file never score fuzzily; the fuzzy path
            // fills this in elsewhere.
            fuzzy_score: None,
            is_definition,
            context_before,
            context_after,
        });
    }

    /// Extract up to `before_context` / `after_context` display-truncated
    /// lines surrounding a matched region, working directly on the
    /// searcher's buffer with `memchr` newline scans.
    ///
    /// NOTE(review): the backward walk assumes `range.start` sits at the
    /// start of the matched line (so the byte just before it is the previous
    /// line's `\n`) — holds for line-oriented sink matches; confirm for
    /// multiline searches.
    fn extract_context(&self, mat: &SinkMatch<'_>) -> (Vec<String>, Vec<String>) {
        if self.before_context == 0 && self.after_context == 0 {
            return (Vec::new(), Vec::new());
        }

        let buffer = mat.buffer();
        let range = mat.bytes_range_in_buffer();

        let mut before = Vec::new();
        if self.before_context > 0 && range.start > 0 {
            // Walk backward from the start of the match line to find preceding lines
            let mut pos = range.start;
            let mut lines_found = 0;
            while lines_found < self.before_context && pos > 0 {
                // Skip the newline just before our current position
                pos -= 1;
                // Find the previous newline; the byte after it starts the line.
                let line_start = match memchr::memrchr(b'\n', &buffer[..pos]) {
                    Some(nl) => nl + 1,
                    None => 0,
                };
                let line = &buffer[line_start..pos];
                // Trim trailing \r (CRLF files)
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                before.push(String::from_utf8_lossy(truncated).into_owned());
                pos = line_start;
                lines_found += 1;
            }
            // Collected nearest-line-first; present in file order.
            before.reverse();
        }

        let mut after = Vec::new();
        if self.after_context > 0 && range.end < buffer.len() {
            let mut pos = range.end;
            let mut lines_found = 0;
            while lines_found < self.after_context && pos < buffer.len() {
                // Find the next newline (or the end of the buffer).
                let line_end = match memchr::memchr(b'\n', &buffer[pos..]) {
                    Some(nl) => pos + nl,
                    None => buffer.len(),
                };
                let line = &buffer[pos..line_end];
                // Trim trailing \r (CRLF files)
                let line = if line.last() == Some(&b'\r') {
                    &line[..line.len() - 1]
                } else {
                    line
                };
                let truncated = truncate_display_bytes(line);
                after.push(String::from_utf8_lossy(truncated).into_owned());
                pos = if line_end < buffer.len() {
                    line_end + 1 // skip past \n
                } else {
                    buffer.len()
                };
                lines_found += 1;
            }
        }

        (before, after)
    }
}
632
633/// Truncate a byte slice for display, respecting UTF-8 char boundaries.
634#[inline]
635fn truncate_display_bytes(bytes: &[u8]) -> &[u8] {
636    if bytes.len() <= MAX_LINE_DISPLAY_LEN {
637        bytes
638    } else {
639        let mut end = MAX_LINE_DISPLAY_LEN;
640        while end > 0 && !is_utf8_char_boundary(bytes[end]) {
641            end -= 1;
642        }
643        &bytes[..end]
644    }
645}
646
647/// Sink for `PlainText` mode.
648///
649/// Highlights are extracted with SIMD-accelerated `memchr::memmem::Finder`.
650/// Case-insensitive matching lowercases the line into a stack buffer before
651/// searching, keeping positions 1:1 for ASCII.
652/// No regex engine is involved at any point.
653struct PlainTextSink<'r> {
654    state: SinkState,
655    finder: &'r memchr::memmem::Finder<'r>,
656    pattern_len: u32,
657    case_insensitive: bool,
658}
659
660impl Sink for PlainTextSink<'_> {
661    type Error = std::io::Error;
662
663    fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
664        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
665            return Ok(false);
666        }
667
668        let line_bytes = mat.bytes();
669        let (display_bytes, display_len, line_number, byte_offset) =
670            SinkState::prepare_line(line_bytes, mat);
671
672        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
673        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
674        let mut col = 0usize;
675        let mut first = true;
676
677        if self.case_insensitive {
678            // Lowercase the display bytes into a stack buffer; positions are 1:1
679            // for ASCII so no mapping is needed.
680            let mut lowered = [0u8; MAX_LINE_DISPLAY_LEN];
681            let len = display_bytes.len().min(MAX_LINE_DISPLAY_LEN);
682            for (dst, &src) in lowered[..len].iter_mut().zip(display_bytes) {
683                *dst = src.to_ascii_lowercase();
684            }
685
686            let mut start_pos = 0usize;
687            while let Some(pos) = self.finder.find(&lowered[start_pos..len]) {
688                let abs_start = (start_pos + pos) as u32;
689                let abs_end = (abs_start + self.pattern_len).min(display_len);
690                if first {
691                    col = abs_start as usize;
692                    first = false;
693                }
694                match_byte_offsets.push((abs_start, abs_end));
695                start_pos += pos + 1;
696            }
697        } else {
698            let mut start_pos = 0usize;
699            while let Some(pos) = self.finder.find(&display_bytes[start_pos..]) {
700                let abs_start = (start_pos + pos) as u32;
701                let abs_end = (abs_start + self.pattern_len).min(display_len);
702                if first {
703                    col = abs_start as usize;
704                    first = false;
705                }
706                match_byte_offsets.push((abs_start, abs_end));
707                start_pos += pos + 1;
708            }
709        }
710
711        let (context_before, context_after) = self.state.extract_context(mat);
712        self.state.push_match(
713            line_number,
714            col,
715            byte_offset,
716            line_content,
717            match_byte_offsets,
718            context_before,
719            context_after,
720        );
721        Ok(true)
722    }
723
724    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
725        Ok(())
726    }
727}
728
729/// Sink for `Regex` mode.
730///
731/// Uses the compiled regex to extract precise variable-length highlight spans
732/// from each matched line. No `memmem` finder is involved.
733struct RegexSink<'r> {
734    state: SinkState,
735    re: &'r regex::bytes::Regex,
736}
737
738impl Sink for RegexSink<'_> {
739    type Error = std::io::Error;
740
741    fn matched(
742        &mut self,
743        _searcher: &Searcher,
744        sink_match: &SinkMatch<'_>,
745    ) -> Result<bool, Self::Error> {
746        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
747            return Ok(false);
748        }
749
750        let line_bytes = sink_match.bytes();
751        let (display_bytes, display_len, line_number, byte_offset) =
752            SinkState::prepare_line(line_bytes, sink_match);
753
754        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
755        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
756        let mut col = 0usize;
757        let mut first = true;
758
759        for m in self.re.find_iter(display_bytes) {
760            let abs_start = m.start() as u32;
761            let abs_end = (m.end() as u32).min(display_len);
762            if first {
763                col = abs_start as usize;
764                first = false;
765            }
766            match_byte_offsets.push((abs_start, abs_end));
767        }
768
769        let (context_before, context_after) = self.state.extract_context(sink_match);
770        self.state.push_match(
771            line_number,
772            col,
773            byte_offset,
774            line_content,
775            match_byte_offsets,
776            context_before,
777            context_after,
778        );
779        Ok(true)
780    }
781
782    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
783        Ok(())
784    }
785}
786
787/// A `grep_matcher::Matcher` backed by Aho-Corasick for multi-pattern search.
788///
789/// Finds the first occurrence of any pattern starting at the given offset.
790/// Always reports `\n` as the line terminator for the fast candidate-line path.
791struct AhoCorasickMatcher<'a> {
792    ac: &'a AhoCorasick,
793}
794
795impl Matcher for AhoCorasickMatcher<'_> {
796    type Captures = NoCaptures;
797    type Error = NoError;
798
799    #[inline]
800    fn find_at(&self, haystack: &[u8], at: usize) -> std::result::Result<Option<Match>, NoError> {
801        let hay = &haystack[at..];
802        let found: Option<aho_corasick::Match> = self.ac.find(hay);
803        Ok(found.map(|m| Match::new(at + m.start(), at + m.end())))
804    }
805
806    #[inline]
807    fn new_captures(&self) -> Result<NoCaptures, NoError> {
808        Ok(NoCaptures::new())
809    }
810
811    #[inline]
812    fn line_terminator(&self) -> Option<grep_matcher::LineTerminator> {
813        Some(grep_matcher::LineTerminator::byte(b'\n'))
814    }
815}
816
817/// Sink for Aho-Corasick multi-pattern mode.
818///
819/// Collects all pattern match positions on each matched line for highlighting.
820struct AhoCorasickSink<'a> {
821    state: SinkState,
822    ac: &'a AhoCorasick,
823}
824
825impl Sink for AhoCorasickSink<'_> {
826    type Error = std::io::Error;
827
828    fn matched(&mut self, _searcher: &Searcher, mat: &SinkMatch<'_>) -> Result<bool, Self::Error> {
829        if self.state.max_matches != 0 && self.state.matches.len() >= self.state.max_matches {
830            return Ok(false);
831        }
832
833        let line_bytes = mat.bytes();
834        let (display_bytes, display_len, line_number, byte_offset) =
835            SinkState::prepare_line(line_bytes, mat);
836
837        let line_content = String::from_utf8_lossy(display_bytes).into_owned();
838        let mut match_byte_offsets: SmallVec<[(u32, u32); 4]> = SmallVec::new();
839        let mut col = 0usize;
840        let mut first = true;
841
842        for m in self.ac.find_iter(display_bytes as &[u8]) {
843            let abs_start = m.start() as u32;
844            let abs_end = (m.end() as u32).min(display_len);
845            if first {
846                col = abs_start as usize;
847                first = false;
848            }
849            match_byte_offsets.push((abs_start, abs_end));
850        }
851
852        let (context_before, context_after) = self.state.extract_context(mat);
853        self.state.push_match(
854            line_number,
855            col,
856            byte_offset,
857            line_content,
858            match_byte_offsets,
859            context_before,
860            context_after,
861        );
862        Ok(true)
863    }
864
865    fn finish(&mut self, _: &Searcher, _: &fff_grep::SinkFinish) -> Result<(), Self::Error> {
866        Ok(())
867    }
868}
869
870/// Multi-pattern OR search using Aho-Corasick.
871///
872/// Builds a single automaton from all patterns and searches each file in one
873/// pass. This is significantly faster than regex alternation for literal text
874/// searches because Aho-Corasick uses SIMD-accelerated multi-needle matching.
875///
876/// Returns the same `GrepResult` type as `grep_search`.
877pub fn multi_grep_search<'a>(
878    files: &'a [FileItem],
879    patterns: &[&str],
880    constraints: &[fff_query_parser::Constraint<'_>],
881    options: &GrepSearchOptions,
882) -> GrepResult<'a> {
883    let total_files = files.len();
884
885    if patterns.is_empty() || patterns.iter().all(|p| p.is_empty()) {
886        return GrepResult {
887            total_files,
888            filtered_file_count: total_files,
889            ..Default::default()
890        };
891    }
892
893    let (mut files_to_search, mut filtered_file_count) =
894        prepare_files_to_search(files, constraints, options);
895
896    // If constraints yielded 0 files and we had FilePath constraints,
897    // retry without them (the path token was likely part of the search text).
898    if files_to_search.is_empty()
899        && let Some(stripped) = strip_file_path_constraints(constraints)
900    {
901        let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options);
902        files_to_search = retry_files;
903        filtered_file_count = retry_count;
904    }
905
906    if files_to_search.is_empty() {
907        return GrepResult {
908            total_files,
909            filtered_file_count,
910            ..Default::default()
911        };
912    }
913
914    // Smart case: case-insensitive when all patterns are lowercase
915    let case_insensitive = if options.smart_case {
916        !patterns.iter().any(|p| p.chars().any(|c| c.is_uppercase()))
917    } else {
918        false
919    };
920
921    let ac = aho_corasick::AhoCorasickBuilder::new()
922        .ascii_case_insensitive(case_insensitive)
923        .build(patterns)
924        .expect("Aho-Corasick build should not fail for literal patterns");
925
926    let searcher = {
927        let mut b = SearcherBuilder::new();
928        b.line_number(true);
929        b
930    }
931    .build();
932
933    let ac_matcher = AhoCorasickMatcher { ac: &ac };
934    run_file_search(
935        &files_to_search,
936        options,
937        total_files,
938        filtered_file_count,
939        None,
940        |file_bytes: &[u8], max_matches: usize| {
941            let state = SinkState {
942                file_index: 0,
943                matches: Vec::with_capacity(4),
944                max_matches,
945                before_context: options.before_context,
946                after_context: options.after_context,
947                classify_definitions: options.classify_definitions,
948            };
949
950            let mut sink = AhoCorasickSink { state, ac: &ac };
951
952            if let Err(e) = searcher.search_slice(&ac_matcher, file_bytes, &mut sink) {
953                tracing::error!(error = %e, "Grep (aho-corasick multi) search failed");
954            }
955
956            sink.state.matches
957        },
958    )
959}
960
/// True when `b` can begin a UTF-8 code point (an ASCII byte or a leading
/// byte), i.e. it is not a continuation byte of the form `0b10xx_xxxx`.
///
/// Mirrors the private `u8::is_utf8_char_boundary` in the standard library.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // Continuation bytes are exactly 0x80..=0xBF.
    b < 0x80 || b >= 0xC0
}
966
967/// Build a regex from the user's grep text.
968///
969/// In `PlainText` mode:
970/// - Escapes the input for literal matching (users type text, not regex)
971/// - Applies smart case: case-insensitive unless query has uppercase
972/// - Detects `\n` for multiline
973///
974/// In `Regex` mode:
975/// - The input is passed directly to the regex engine without escaping
976/// - Smart case still applies
977/// - Returns `None` for invalid regex patterns — the caller falls back to literal mode
978fn build_regex(pattern: &str, smart_case: bool) -> Result<regex::bytes::Regex, String> {
979    if pattern.is_empty() {
980        return Err("empty pattern".to_string());
981    }
982
983    let regex_pattern = if pattern.contains("\\n") {
984        pattern.replace("\\n", "\n")
985    } else {
986        pattern.to_string()
987    };
988
989    let case_insensitive = if smart_case {
990        !pattern.chars().any(|c| c.is_uppercase())
991    } else {
992        false
993    };
994
995    regex::bytes::RegexBuilder::new(&regex_pattern)
996        .case_insensitive(case_insensitive)
997        .multi_line(true)
998        .unicode(false)
999        .build()
1000        .map_err(|e| e.to_string())
1001}
1002
1003/// Convert character-position indices from neo_frizbee into byte-offset
1004/// pairs (start, end) suitable for `match_byte_offsets`.
1005///
1006/// frizbee returns character positions (0-based index into the char
1007/// iterator). We need byte ranges because the UI renderer and Lua layer
1008/// use byte offsets for extmark highlights.
1009///
1010/// Each matched character becomes its own (byte_start, byte_end) pair.
1011/// Adjacent characters are merged into a single contiguous range.
1012fn char_indices_to_byte_offsets(line: &str, char_indices: &[usize]) -> SmallVec<[(u32, u32); 4]> {
1013    if char_indices.is_empty() {
1014        return SmallVec::new();
1015    }
1016
1017    // Build a map: char_index -> (byte_start, byte_end) for all chars.
1018    // Iterating all chars is O(n) in the line length which is bounded by MAX_LINE_DISPLAY_LEN (512).
1019    let char_byte_ranges: Vec<(usize, usize)> = line
1020        .char_indices()
1021        .map(|(byte_pos, ch)| (byte_pos, byte_pos + ch.len_utf8()))
1022        .collect();
1023
1024    // Convert char indices to byte ranges, merging adjacent ranges
1025    let mut result: SmallVec<[(u32, u32); 4]> = SmallVec::with_capacity(char_indices.len());
1026
1027    for &ci in char_indices {
1028        if ci >= char_byte_ranges.len() {
1029            continue; // out of bounds (shouldn't happen with valid data)
1030        }
1031        let (start, end) = char_byte_ranges[ci];
1032        // Merge with previous range if adjacent
1033        if let Some(last) = result.last_mut()
1034            && last.1 == start as u32
1035        {
1036            last.1 = end as u32;
1037            continue;
1038        }
1039        result.push((start as u32, end as u32));
1040    }
1041
1042    result
1043}
1044
/// Shared driver for the searcher-based grep modes (plain text, regex,
/// multi-pattern): fans the paginated file slice out across rayon, runs
/// `search_file` on each mmap'd file, and folds the per-file results into
/// a `GrepResult` via `collect_grep_results`.
///
/// `search_file` receives the file bytes and the per-file match cap and
/// returns that file's matches (empty vec = no matches).
#[tracing::instrument(skip_all, level = Level::DEBUG)]
fn run_file_search<'a, F>(
    files_to_search: &[&'a FileItem],
    options: &GrepSearchOptions,
    total_files: usize,
    filtered_file_count: usize,
    regex_fallback_error: Option<String>,
    search_file: F,
) -> GrepResult<'a>
where
    F: Fn(&[u8], usize) -> Vec<GrepMatch> + Sync,
{
    // time_budget_ms == 0 disables the budget entirely.
    let time_budget = if options.time_budget_ms > 0 {
        Some(std::time::Duration::from_millis(options.time_budget_ms))
    } else {
        None
    };

    let search_start = std::time::Instant::now();
    let budget_exceeded = AtomicBool::new(false);

    // Parallel phase: search all files concurrently using rayon.
    // Every file is visited (no early-exit gaps), so per_file_results is a
    // contiguous, order-preserving subset — pagination offsets stay correct.
    // The time budget acts as the work bound; there is no separate file cap.
    //
    // NOTE(review): once the budget trips, any file whose closure runs *after*
    // the flag check fails is skipped — including files whose index precedes
    // files that already matched on another thread. Those skipped files'
    // matches are not recoverable on the next page. Presumably an accepted
    // best-effort tradeoff; confirm with callers.
    let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
        .par_iter()
        .enumerate()
        .filter_map(|(idx, file)| {
            // Time budget check (relaxed — checked once per file, not per line).
            if let Some(budget) = time_budget
                && search_start.elapsed() > budget
            {
                budget_exceeded.store(true, Ordering::Relaxed);
                return None;
            }

            // Files whose mmap is unavailable are silently skipped.
            let content = file.get_mmap()?;
            let file_matches = search_file(content, options.max_matches_per_file);

            if file_matches.is_empty() {
                return None;
            }

            Some((idx, *file, file_matches))
        })
        .collect();

    collect_grep_results(
        per_file_results,
        files_to_search.len(),
        options,
        total_files,
        filtered_file_count,
        regex_fallback_error,
        budget_exceeded.load(Ordering::Relaxed),
    )
}
1103
/// Flatten per-file results into the final `GrepResult`.
///
/// Shared post-processing for both `run_file_search` (simple closure) and
/// `fuzzy_grep_search` (which uses `map_init` for per-thread matcher reuse).
///
/// `per_file_results` is expected to be ordered by its `usize` batch index
/// (the file's position in the searched slice) and to contain only files
/// that produced at least one match.
fn collect_grep_results<'a>(
    per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)>,
    files_to_search_len: usize,
    options: &GrepSearchOptions,
    total_files: usize,
    filtered_file_count: usize,
    regex_fallback_error: Option<String>,
    budget_exceeded: bool,
) -> GrepResult<'a> {
    let page_limit = options.page_limit;

    // Each match stores a `file_index` pointing into `result_files` so that
    // consumers (FFI JSON, Lua) can look up file metadata without duplicating
    // it across every match from the same file.
    let mut result_files: Vec<&'a FileItem> = Vec::new();
    let mut all_matches: Vec<GrepMatch> = Vec::new();
    // files_consumed tracks how far into files_to_search we have advanced,
    // counting every file whose results were emitted (with or without matches).
    // We use the batch_idx of the last consumed file + 1, which is correct
    // because per_file_results only contains files that had matches, and
    // files between them that had no matches were still searched and can be
    // safely skipped on the next page.
    let mut files_consumed: usize = 0;

    for (batch_idx, file, file_matches) in per_file_results {
        // batch_idx is the 0-based position in files_to_search.
        // Advance files_consumed to include this file and all no-match files before it.
        files_consumed = batch_idx + 1;

        let file_result_idx = result_files.len();
        result_files.push(file);

        // Re-point every match at this file's slot in result_files.
        for mut m in file_matches {
            m.file_index = file_result_idx;
            all_matches.push(m);
        }

        // page_limit is a soft cap: we always finish the current file before
        // stopping, so no matches are dropped. A page may return up to
        // page_limit + max_matches_per_file - 1 matches in the worst case.
        if all_matches.len() >= page_limit {
            break;
        }
    }

    // If no file had any match, we searched the entire slice.
    if result_files.is_empty() {
        files_consumed = files_to_search_len;
    }

    // NOTE(review): when the loop drains fully (no page-limit break) but the
    // trailing files had no matches, files_consumed stops at the last matching
    // file, so total_files_searched slightly undercounts. Harmless for
    // pagination (has_more is false on that path unless the budget tripped,
    // and re-searching trailing no-match files is safe), but worth confirming
    // if the stat is surfaced to users.
    let has_more = budget_exceeded
        || (all_matches.len() >= page_limit && files_consumed < files_to_search_len);

    // 0 signals "no further pages" to the caller.
    let next_file_offset = if has_more {
        options.file_offset + files_consumed
    } else {
        0
    };

    GrepResult {
        matches: all_matches,
        files: result_files,
        total_files_searched: files_consumed,
        total_files,
        filtered_file_count,
        next_file_offset,
        regex_fallback_error,
    }
}
1177
1178/// Filter files by constraints and size/binary checks, sort by frecency,
1179/// and apply file-based pagination.
1180///
1181/// Returns `(paginated_files, filtered_file_count)`. The paginated slice
1182/// is empty if the offset is past the end of available files.
1183fn prepare_files_to_search<'a>(
1184    files: &'a [FileItem],
1185    constraints: &[fff_query_parser::Constraint<'_>],
1186    options: &GrepSearchOptions,
1187) -> (Vec<&'a FileItem>, usize) {
1188    let prefiltered: Vec<&FileItem> = if constraints.is_empty() {
1189        files
1190            .iter()
1191            .filter(|f| !f.is_binary && f.size > 0 && f.size <= options.max_file_size)
1192            .collect()
1193    } else {
1194        match apply_constraints(files, constraints) {
1195            Some(constrained) => constrained
1196                .into_iter()
1197                .filter(|f| !f.is_binary && f.size > 0 && f.size <= options.max_file_size)
1198                .collect(),
1199            None => files
1200                .iter()
1201                .filter(|f| !f.is_binary && f.size > 0 && f.size <= options.max_file_size)
1202                .collect(),
1203        }
1204    };
1205
1206    let total_count = prefiltered.len();
1207
1208    // Sort by frecency (files are stored by path, not frecency)
1209    let mut sorted_files = prefiltered;
1210    sort_with_buffer(&mut sorted_files, |a, b| {
1211        b.total_frecency_score
1212            .cmp(&a.total_frecency_score)
1213            .then(b.modified.cmp(&a.modified))
1214    });
1215
1216    if options.file_offset < total_count {
1217        let sorted_files = sorted_files.split_off(options.file_offset);
1218        (sorted_files, total_count)
1219    } else {
1220        (Vec::new(), total_count)
1221    }
1222}
1223
1224/// Fuzzy grep search using SIMD-accelerated `neo_frizbee::match_list`.
1225///
1226/// # Why this doesn't use `grep-searcher` / `GrepSink`
1227///
1228/// PlainText and Regex modes use the `grep-searcher` pipeline: a `Matcher`
1229/// finds candidate lines, and a `Sink` collects them one at a time. This
1230/// works well because memchr/regex can *skip* non-matching lines in O(n)
1231/// without scoring every one.
1232///
1233/// Fuzzy matching is fundamentally different. Every line is a candidate —
1234/// the Smith-Waterman score determines whether it passes, not a substring
1235/// or pattern test. The `Matcher::find_at` trait forces per-line calls to
1236/// the *reference* (scalar) smith-waterman, which is O(needle × line_len)
1237/// per line. For a 10k-line file that's 10k sequential reference calls.
1238///
1239/// `neo_frizbee::match_list` solves this by batching lines into
1240/// fixed-width SIMD buckets (4, 8, 12 … 512 bytes) and scoring 16+
1241/// haystacks per SIMD invocation. A single `match_list` call over the
1242/// entire file replaces 10k individual `match_indices` calls. We then
1243/// call `match_indices` *only* on the ~5-20 lines that pass `min_score`
1244/// to extract character highlight positions.
1245///
1246/// Line splitting uses `memchr::memchr` (the same SIMD-accelerated byte
1247/// search that `grep-searcher` and `bstr::ByteSlice::find_byte` use
1248/// internally) to locate `\n` terminators. This gives us the same
1249/// performance as the searcher's `LineStep` iterator without pulling in
1250/// the full searcher machinery.
1251///
1252/// For each file:
1253///   1. mmap the file, split lines via memchr '\n' (tracking line numbers + byte offsets)
1254///   2. Batch all lines through `match_list` (SIMD smith-waterman)
1255///   3. Filter results by `min_score`
1256///   4. Call `match_indices` only on passing lines to get character highlight offsets
1257fn fuzzy_grep_search<'a>(
1258    grep_text: &str,
1259    files_to_search: &[&'a FileItem],
1260    options: &GrepSearchOptions,
1261    total_files: usize,
1262    filtered_file_count: usize,
1263    case_insensitive: bool,
1264) -> GrepResult<'a> {
1265    // max_typos controls how many *needle* characters can be unmatched.
1266    // A transposition (e.g. "shcema" → "schema") costs ~1 typo with
1267    // default gap penalties. We scale max_typos by needle length:
1268    //   1-2 chars → 0 typos (exact subsequence only)
1269    //   3-5 chars → 1 typo
1270    //   6+  chars → 2 typos
1271    let max_typos = (grep_text.len() / 3).min(2);
1272    let scoring = neo_frizbee::Scoring {
1273        // Use default gap penalties. Higher values (e.g. 20) cause
1274        // smith-waterman to prefer *dropping needle chars* over paying
1275        // gap costs, which inflates the typo count and breaks
1276        // transposition matching ("shcema" → "schema" becomes 3 typos
1277        // instead of 1). Scattered matches are filtered by max_typos
1278        // and the match span check below instead.
1279        exact_match_bonus: 100,
1280        // gap_open_penalty: 4,
1281        // gap_extend_penalty: 2,
1282        prefix_bonus: 0,
1283        capitalization_bonus: if case_insensitive { 0 } else { 4 },
1284        ..neo_frizbee::Scoring::default()
1285    };
1286
1287    let matcher = neo_frizbee::Matcher::new(
1288        grep_text,
1289        &neo_frizbee::Config {
1290            // Use the real max_typos so frizbee's SIMD prefilter actually rejects non-matching lines (~2 SIMD instructions per line vs full SW scoring).
1291            max_typos: Some(max_typos as u16),
1292            sort: false,
1293            scoring,
1294        },
1295    );
1296
1297    // Minimum score threshold: 50% of a perfect contiguous match.
1298    // With default scoring (match_score=12, matching_case_bonus=4 = 16/char),
1299    // a transposition costs ~5 from a gap, keeping the score well above 50%.
1300    let perfect_score = (grep_text.len() as u16) * 16;
1301    let min_score = (perfect_score * 50) / 100;
1302
1303    // Maximum allowed span of matched characters in the haystack, relative
1304    // to needle length.
1305    //
1306    // We allow up to needle_len * 2 to accommodate fuzzy subsequence
1307    // matches in longer identifiers (e.g. "SortedMap" → "SortedArrayMap"
1308    // has span 13 for needle 9). Quality is enforced by the density and
1309    // gap checks below, not just span alone.
1310    let max_match_span = grep_text.len() * 2;
1311    let needle_len = grep_text.len();
1312
1313    // We scale by needle_len: longer needles tolerate more gaps.
1314    let max_gaps = (needle_len / 4).max(1);
1315
1316    // File-level prefilter: collect unique needle chars (both cases) for
1317    // a fast memchr scan.  If a file doesn't contain enough distinct
1318    // needle characters, skip it entirely — no line splitting needed.
1319    let needle_bytes = grep_text.as_bytes();
1320    let mut unique_needle_chars: Vec<u8> = Vec::new();
1321    for &b in needle_bytes {
1322        let lo = b.to_ascii_lowercase();
1323        let hi = b.to_ascii_uppercase();
1324        if !unique_needle_chars.contains(&lo) {
1325            unique_needle_chars.push(lo);
1326        }
1327        if lo != hi && !unique_needle_chars.contains(&hi) {
1328            unique_needle_chars.push(hi);
1329        }
1330    }
1331    // How many distinct needle chars must appear in the file.
1332    // With max_typos allowed, we need at least (unique_count - max_typos).
1333    let unique_count = {
1334        let mut seen = [false; 256];
1335        for &b in needle_bytes {
1336            seen[b.to_ascii_lowercase() as usize] = true;
1337        }
1338        seen.iter().filter(|&&v| v).count()
1339    };
1340    let min_chars_required = unique_count.saturating_sub(max_typos);
1341
1342    let time_budget = if options.time_budget_ms > 0 {
1343        Some(std::time::Duration::from_millis(options.time_budget_ms))
1344    } else {
1345        None
1346    };
1347    let search_start = std::time::Instant::now();
1348    let budget_exceeded = AtomicBool::new(false);
1349    let max_matches_per_file = options.max_matches_per_file;
1350
1351    // Parallel phase with `map_init`: each rayon worker thread clones the
1352    // matcher once and reuses it across all files that thread processes.
1353    // This avoids per-file clone overhead (CPUID detection + matrix alloc).
1354    let per_file_results: Vec<(usize, &'a FileItem, Vec<GrepMatch>)> = files_to_search
1355        .par_iter()
1356        .enumerate()
1357        .map_init(
1358            || matcher.clone(),
1359            |matcher, (idx, file)| {
1360                if let Some(budget) = time_budget
1361                    && search_start.elapsed() > budget
1362                {
1363                    budget_exceeded.store(true, Ordering::Relaxed);
1364                    return None;
1365                }
1366
1367                let file_bytes = file.get_mmap()?;
1368
1369                // File-level prefilter: check if enough distinct needle chars
1370                // exist anywhere in the file bytes.  Uses memchr for speed.
1371                if min_chars_required > 0 {
1372                    let mut chars_found = 0usize;
1373                    for &ch in &unique_needle_chars {
1374                        if memchr::memchr(ch, file_bytes).is_some() {
1375                            chars_found += 1;
1376                            if chars_found >= min_chars_required {
1377                                break;
1378                            }
1379                        }
1380                    }
1381                    if chars_found < min_chars_required {
1382                        return None;
1383                    }
1384                }
1385
1386                // Validate the whole file as UTF-8 once upfront. Source code
1387                // files are virtually always valid UTF-8; this single check
1388                // replaces per-line from_utf8 calls (~8% of fuzzy grep time).
1389                let file_is_utf8 = std::str::from_utf8(file_bytes).is_ok();
1390
1391                // Reuse grep-searcher's LineStep for SIMD-accelerated line iteration.
1392                let mut stepper = LineStep::new(b'\n', 0, file_bytes.len());
1393                let estimated_lines = (file_bytes.len() / 40).max(64);
1394                let mut file_lines: Vec<&str> = Vec::with_capacity(estimated_lines);
1395                let mut line_meta: Vec<(u64, u64)> = Vec::with_capacity(estimated_lines);
1396                let line_term_lf = grep_matcher::LineTerminator::byte(b'\n');
1397                let line_term_cr = grep_matcher::LineTerminator::byte(b'\r');
1398
1399                let mut line_number: u64 = 1;
1400                while let Some(line_match) = stepper.next_match(file_bytes) {
1401                    let byte_offset = line_match.start() as u64;
1402
1403                    // Strip line terminators (\n, \r).
1404                    let trimmed = lines::without_terminator(
1405                        lines::without_terminator(&file_bytes[line_match], line_term_lf),
1406                        line_term_cr,
1407                    );
1408
1409                    if !trimmed.is_empty() {
1410                        // SAFETY: when the whole file is valid UTF-8, every
1411                        // sub-slice split on ASCII byte boundaries (\n, \r)
1412                        // is also valid UTF-8.
1413                        let line_str = if file_is_utf8 {
1414                            unsafe { std::str::from_utf8_unchecked(trimmed) }
1415                        } else if let Ok(s) = std::str::from_utf8(trimmed) {
1416                            s
1417                        } else {
1418                            line_number += 1;
1419                            continue;
1420                        };
1421                        file_lines.push(line_str);
1422                        line_meta.push((line_number, byte_offset));
1423                    }
1424
1425                    line_number += 1;
1426                }
1427
1428                if file_lines.is_empty() {
1429                    return None;
1430                }
1431
1432                // Single-pass: score + indices in one Smith-Waterman run per line.
1433                let matches_with_indices = matcher.match_list_indices(&file_lines);
1434                let mut file_matches: Vec<GrepMatch> = Vec::new();
1435
1436                for mut match_indices in matches_with_indices {
1437                    if match_indices.score < min_score {
1438                        continue;
1439                    }
1440
1441                    let idx = match_indices.index as usize;
1442                    let raw_line = file_lines[idx];
1443
1444                    let truncated = truncate_display_bytes(raw_line.as_bytes());
1445                    let display_line = if truncated.len() < raw_line.len() {
1446                        // SAFETY: truncate_display_bytes preserves UTF-8 char boundaries
1447                        &raw_line[..truncated.len()]
1448                    } else {
1449                        raw_line
1450                    };
1451
1452                    // If the line was truncated, re-compute indices on the shorter string.
1453                    if display_line.len() < raw_line.len() {
1454                        let Some(re_indices) = matcher
1455                            .match_list_indices(&[display_line])
1456                            .into_iter()
1457                            .next()
1458                        else {
1459                            continue;
1460                        };
1461                        match_indices = re_indices;
1462                    }
1463
1464                    // upstream returns indices in reverse order, sort ascending
1465                    match_indices.indices.sort_unstable();
1466
1467                    // Minimum matched chars: at least (needle_len - 1) characters
1468                    // must appear in the match indices. This allows one missing
1469                    // char (a single typo/transposition) but rejects matches that
1470                    // only hit a partial substring (e.g. "HashMap" for "shcema").
1471                    let min_matched = needle_len.saturating_sub(1).max(1);
1472                    if match_indices.indices.len() < min_matched {
1473                        continue;
1474                    }
1475
1476                    let indices = &match_indices.indices;
1477
1478                    if let (Some(&first), Some(&last)) = (indices.first(), indices.last()) {
1479                        // Span check: reject widely scattered matches.
1480                        let span = last - first + 1;
1481                        if span > max_match_span {
1482                            continue;
1483                        }
1484
1485                        // Density check: matched chars / span must be dense enough.
1486                        // Relaxed for perfect subsequence matches (all needle chars
1487                        // present), stricter when typos are involved.
1488                        let density = (indices.len() * 100) / span;
1489                        let min_density = if indices.len() >= needle_len {
1490                            50 // Perfect subsequence — relaxed
1491                        } else {
1492                            70 // Has typos — stricter
1493                        };
1494                        if density < min_density {
1495                            continue;
1496                        }
1497
1498                        // Gap count check: count discontinuities in the indices.
1499                        let gap_count = indices.windows(2).filter(|w| w[1] != w[0] + 1).count();
1500                        if gap_count > max_gaps {
1501                            continue;
1502                        }
1503                    }
1504
1505                    let (ln, bo) = line_meta[idx];
1506                    let match_byte_offsets =
1507                        char_indices_to_byte_offsets(display_line, &match_indices.indices);
1508                    let col = match_byte_offsets
1509                        .first()
1510                        .map(|r| r.0 as usize)
1511                        .unwrap_or(0);
1512
1513                    file_matches.push(GrepMatch {
1514                        file_index: 0,
1515                        line_number: ln,
1516                        col,
1517                        byte_offset: bo,
1518                        is_definition: options.classify_definitions
1519                            && is_definition_line(display_line),
1520                        line_content: display_line.to_string(),
1521                        match_byte_offsets,
1522                        fuzzy_score: Some(match_indices.score),
1523                        context_before: Vec::new(),
1524                        context_after: Vec::new(),
1525                    });
1526
1527                    if max_matches_per_file != 0 && file_matches.len() >= max_matches_per_file {
1528                        break;
1529                    }
1530                }
1531
1532                if file_matches.is_empty() {
1533                    return None;
1534                }
1535
1536                Some((idx, *file, file_matches))
1537            },
1538        )
1539        .flatten()
1540        .collect();
1541
1542    collect_grep_results(
1543        per_file_results,
1544        files_to_search.len(),
1545        options,
1546        total_files,
1547        filtered_file_count,
1548        None,
1549        budget_exceeded.load(Ordering::Relaxed),
1550    )
1551}
1552
/// Perform a grep search across all indexed files.
///
/// When `query` is empty, returns git-modified/untracked files sorted by
/// frecency for the "welcome state" UI.
///
/// Dispatch by `options.mode`:
/// - `Fuzzy` delegates to `fuzzy_grep_search` and returns early.
/// - `Regex` compiles the pattern; on compile failure the error is recorded
///   in the result and the search falls back to plain-text matching.
/// - `PlainText` uses a `memmem`-based substring matcher.
pub fn grep_search<'a>(
    files: &'a [FileItem],
    query: &FFFQuery<'_>,
    options: &GrepSearchOptions,
) -> GrepResult<'a> {
    let total_files = files.len();

    // Extract the grep text and file constraints from the parsed query.
    // For grep, the search pattern is the original query with constraint tokens
    // removed. All non-constraint text tokens are collected and joined with
    // spaces to form the grep pattern:
    //   "name = *.rs someth" -> grep "name = someth" with constraint Extension("rs")
    let constraints_from_query = &query.constraints[..];

    let grep_text = if !matches!(query.fuzzy_query, fff_query_parser::FuzzyQuery::Empty) {
        query.grep_text()
    } else {
        // Constraint-only or empty query — use raw_query for backslash-escape handling.
        let t = query.raw_query.trim();
        // Leading-backslash handling: if the remainder of the query (without
        // the backslash) parses as containing constraints, the backslash is
        // dropped and the remainder is used; otherwise the raw text — backslash
        // included — is kept as the literal search pattern.
        if t.starts_with('\\') && t.len() > 1 {
            // Safe slice: '\\' is a 1-byte ASCII char, so index 1 is a char boundary.
            let suffix = &t[1..];
            let parser = QueryParser::new(GrepConfig);
            if !parser.parse(suffix).constraints.is_empty() {
                suffix.to_string()
            } else {
                t.to_string()
            }
        } else {
            t.to_string()
        }
    };

    // No search text remained after constraint extraction — nothing to grep.
    if grep_text.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count: total_files,
            next_file_offset: 0,
            matches: Vec::with_capacity(4),
            files: Vec::new(),
            ..Default::default()
        };
    }

    // Filter, sort, and paginate files (shared across all modes)
    let (mut files_to_search, mut filtered_file_count) =
        prepare_files_to_search(files, constraints_from_query, options);

    // If constraints yielded 0 files and we had a FilePath constraint,
    // retry without it — the filename may not exist in this repo.
    // Keep the original grep_text (e.g. "ActorAuth") rather than restoring
    // the raw query ("nonexistent.rs ActorAuth"), since the search term
    // was correctly extracted by the parser.
    if files_to_search.is_empty()
        && let Some(stripped) = strip_file_path_constraints(constraints_from_query)
    {
        let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options);
        files_to_search = retry_files;
        filtered_file_count = retry_count;
    }

    // Still nothing to search after the retry — return an empty page.
    if files_to_search.is_empty() {
        return GrepResult {
            total_files,
            filtered_file_count,
            next_file_offset: 0,
            ..Default::default()
        };
    }

    // Smart case: case-insensitive only when the pattern contains no
    // uppercase character; any uppercase makes the search case-sensitive.
    let case_insensitive = if options.smart_case {
        !grep_text.chars().any(|c| c.is_uppercase())
    } else {
        false
    };

    let mut regex_fallback_error: Option<String> = None;
    let regex = match options.mode {
        GrepMode::PlainText => None,
        GrepMode::Fuzzy => {
            // Fuzzy mode has its own pipeline; hand off and return its result.
            return fuzzy_grep_search(
                &grep_text,
                &files_to_search,
                options,
                total_files,
                filtered_file_count,
                case_insensitive,
            );
        }
        // On compile failure, record the error (surfaced via run_file_search)
        // and fall back to plain-text matching (`regex` becomes None).
        GrepMode::Regex => build_regex(&grep_text, options.smart_case)
            .inspect_err(|err| {
                tracing::warn!("Regex compilation failed for {}. Error {}", grep_text, err);

                regex_fallback_error = Some(err.to_string());
            })
            .ok(),
    };

    // A user-typed `\n` (odd backslash run before `n`) switches the searcher
    // into multi-line mode; see the escape tests in this file for the parity
    // rules.
    let is_multiline = has_unescaped_newline_escape(&grep_text);

    let effective_pattern = if is_multiline {
        replace_unescaped_newline_escapes(&grep_text)
    } else {
        grep_text.to_string()
    };

    // Build the finder pattern once — used by PlainTextSink (and as a
    // literal-needle fallback anchor when regex compilation fell back to plain).
    // ASCII-only lowercasing mirrors the per-haystack lowering in the sink.
    let finder_pattern: Vec<u8> = if case_insensitive {
        effective_pattern.as_bytes().to_ascii_lowercase()
    } else {
        effective_pattern.as_bytes().to_vec()
    };
    let finder = memchr::memmem::Finder::new(&finder_pattern);
    // Cast is safe in practice: patterns are short user-typed queries.
    let pattern_len = finder_pattern.len() as u32;

    // `PlainTextMatcher` is used by the grep-searcher engine for line detection.
    // `PlainTextSink` / `RegexSink` handle highlight extraction independently.
    let plain_matcher = PlainTextMatcher {
        needle: &finder_pattern,
        case_insensitive,
    };

    let searcher = {
        let mut b = SearcherBuilder::new();
        b.line_number(true).multi_line(is_multiline);
        b
    }
    .build();

    // Dispatch to the appropriate sink type at the boundary — zero runtime
    // branching inside the per-line hot path.
    run_file_search(
        &files_to_search,
        options,
        total_files,
        filtered_file_count,
        regex_fallback_error,
        |file_bytes: &[u8], max_matches: usize| {
            let state = SinkState {
                file_index: 0, // set by run_file_search
                matches: Vec::with_capacity(4),
                max_matches,
                before_context: options.before_context,
                after_context: options.after_context,
                classify_definitions: options.classify_definitions,
            };

            match regex {
                Some(ref re) => {
                    let regex_matcher = RegexMatcher {
                        regex: re,
                        is_multiline,
                    };
                    let mut sink = RegexSink { state, re };
                    // Errors are logged, not propagated: a failed file search
                    // yields whatever matches were collected before the error.
                    if let Err(e) = searcher.search_slice(&regex_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (regex) search failed");
                    }
                    sink.state.matches
                }
                None => {
                    let mut sink = PlainTextSink {
                        state,
                        finder: &finder,
                        pattern_len,
                        case_insensitive,
                    };
                    if let Err(e) = searcher.search_slice(&plain_matcher, file_bytes, &mut sink) {
                        tracing::error!(error = %e, "Grep (plain text) search failed");
                    }
                    sink.state.matches
                }
            }
        },
    )
}
1732
1733/// Parse a grep query using the GrepConfig parser.
1734pub fn parse_grep_query(query: &str) -> FFFQuery<'_> {
1735    let parser = QueryParser::new(GrepConfig);
1736    parser.parse(query)
1737}
1738
1739fn strip_file_path_constraints<'a>(
1740    constraints: &[Constraint<'a>],
1741) -> Option<fff_query_parser::ConstraintVec<'a>> {
1742    if !constraints
1743        .iter()
1744        .any(|c| matches!(c, Constraint::FilePath(_)))
1745    {
1746        return None;
1747    }
1748
1749    let filtered: fff_query_parser::ConstraintVec<'a> = constraints
1750        .iter()
1751        .filter(|c| !matches!(c, Constraint::FilePath(_)))
1752        .cloned()
1753        .collect();
1754
1755    Some(filtered)
1756}
1757
#[cfg(test)]
mod tests {
    use super::*;

    // Backslash-parity rules for multiline detection: an ODD run of
    // backslashes before `n` means the user typed a real `\n` escape
    // (multiline); an EVEN run means literal backslashes followed by `n`.
    #[test]
    fn test_unescaped_newline_detection() {
        // Single \n → multiline
        assert!(has_unescaped_newline_escape("foo\\nbar"));
        // \\n → escaped backslash + literal n, NOT multiline
        // (this is what the user types when grepping Rust source with `\\nvim`)
        assert!(!has_unescaped_newline_escape("foo\\\\nvim-data"));
        // Real-world: source file has literal \\AppData\\Local\\nvim-data
        // (double backslash in the file, so user types double backslash)
        assert!(!has_unescaped_newline_escape(
            r#"format!("{}\\AppData\\Local\\nvim-data","#
        ));
        // No \n at all
        assert!(!has_unescaped_newline_escape("hello world"));
        // \\\\n → even number of backslashes before n → NOT multiline
        assert!(!has_unescaped_newline_escape("foo\\\\\\\\nbar"));
        // \\\n → 3 backslashes: first two pair up, third + n = \n → multiline
        assert!(has_unescaped_newline_escape("foo\\\\\\nbar"));
    }

    // Companion to the detection test: only the unescaped `\n` is rewritten
    // to a real newline; escaped backslash sequences pass through untouched.
    #[test]
    fn test_replace_unescaped_newline() {
        // \n → real newline
        assert_eq!(replace_unescaped_newline_escapes("foo\\nbar"), "foo\nbar");
        // \\n → preserved as-is
        assert_eq!(
            replace_unescaped_newline_escapes("foo\\\\nvim"),
            "foo\\\\nvim"
        );
    }

    // Pins the fuzzy post-filter thresholds (min matched chars, max span,
    // density) so changes to fuzzy_grep_search's config are caught here.
    #[test]
    fn test_fuzzy_typo_scoring() {
        // Mirror the config from fuzzy_grep_search
        let needle = "schema";
        let max_typos = (needle.len() / 3).min(2); // 2
        let config = neo_frizbee::Config {
            max_typos: Some(max_typos as u16),
            sort: false,
            scoring: neo_frizbee::Scoring {
                exact_match_bonus: 100,
                ..neo_frizbee::Scoring::default()
            },
        };
        let min_matched = needle.len().saturating_sub(1).max(1); // 5
        let max_match_span = needle.len() + 4; // 10

        // Helper: check if a match would pass our post-filters
        let passes = |n: &str, h: &str| -> bool {
            let Some(mut mi) = neo_frizbee::match_list_indices(n, &[h], &config)
                .into_iter()
                .next()
            else {
                return false;
            };
            // upstream returns indices in reverse order, sort ascending
            mi.indices.sort_unstable();
            if mi.indices.len() < min_matched {
                return false;
            }
            if let (Some(&first), Some(&last)) = (mi.indices.first(), mi.indices.last()) {
                let span = last - first + 1;
                if span > max_match_span {
                    return false;
                }
                let density = (mi.indices.len() * 100) / span;
                if density < 70 {
                    return false;
                }
            }
            true
        };

        // Exact match: must pass
        assert!(passes("schema", "schema"));
        // Exact in longer line: must pass
        assert!(passes("schema", "  schema: String,"));
        // In identifier: must pass
        assert!(passes("schema", "pub fn validate_schema() {}"));
        // Transposition: must pass
        assert!(passes("shcema", "schema"));
        // Partial "ema" only line: must NOT pass
        assert!(!passes("schema", "it has ema in it"));
        // Completely unrelated: must NOT pass
        assert!(!passes("schema", "hello world foo bar"));
    }

    // End-to-end multi_grep_search over real temp files: multiple patterns,
    // a single pattern, and the empty-pattern edge case.
    #[test]
    fn test_multi_grep_search() {
        use crate::types::FileItem;
        use std::io::Write;

        // Create temp files with known content
        let dir = tempfile::tempdir().unwrap();

        // File 1: has "GrepMode" and "GrepMatch"
        let file1_path = dir.path().join("grep.rs");
        {
            let mut f = std::fs::File::create(&file1_path).unwrap();
            writeln!(f, "pub enum GrepMode {{").unwrap();
            writeln!(f, "    PlainText,").unwrap();
            writeln!(f, "    Regex,").unwrap();
            writeln!(f, "}}").unwrap();
            writeln!(f, "pub struct GrepMatch {{").unwrap();
            writeln!(f, "    pub line_number: u64,").unwrap();
            writeln!(f, "}}").unwrap();
        }

        // File 2: has "PlainTextMatcher" only
        let file2_path = dir.path().join("matcher.rs");
        {
            let mut f = std::fs::File::create(&file2_path).unwrap();
            writeln!(f, "struct PlainTextMatcher {{").unwrap();
            writeln!(f, "    needle: Vec<u8>,").unwrap();
            writeln!(f, "}}").unwrap();
        }

        // File 3: no matches
        let file3_path = dir.path().join("other.rs");
        {
            let mut f = std::fs::File::create(&file3_path).unwrap();
            writeln!(f, "fn main() {{").unwrap();
            writeln!(f, "    println!(\"hello\");").unwrap();
            writeln!(f, "}}").unwrap();
        }

        let meta1 = std::fs::metadata(&file1_path).unwrap();
        let meta2 = std::fs::metadata(&file2_path).unwrap();
        let meta3 = std::fs::metadata(&file3_path).unwrap();

        let files = vec![
            FileItem::new_raw(
                file1_path,
                "grep.rs".to_string(),
                "grep.rs".to_string(),
                meta1.len(),
                0,
                None,
                false,
            ),
            FileItem::new_raw(
                file2_path,
                "matcher.rs".to_string(),
                "matcher.rs".to_string(),
                meta2.len(),
                0,
                None,
                false,
            ),
            FileItem::new_raw(
                file3_path,
                "other.rs".to_string(),
                "other.rs".to_string(),
                meta3.len(),
                0,
                None,
                false,
            ),
        ];

        // max_matches_per_file: 0 and time_budget_ms: 0 → unlimited.
        let options = super::GrepSearchOptions {
            max_file_size: 10 * 1024 * 1024,
            max_matches_per_file: 0,
            smart_case: true,
            file_offset: 0,
            page_limit: 100,
            mode: super::GrepMode::PlainText,
            time_budget_ms: 0,
            before_context: 0,
            after_context: 0,
            classify_definitions: false,
        };

        // Test with 3 patterns
        let result = super::multi_grep_search(
            &files,
            &["GrepMode", "GrepMatch", "PlainTextMatcher"],
            &[],
            &options,
        );

        // Should find matches from file1 (GrepMode, GrepMatch) and file2 (PlainTextMatcher)
        assert!(
            result.matches.len() >= 3,
            "Expected at least 3 matches, got {}",
            result.matches.len()
        );

        // Verify all 3 patterns are found
        let has_grep_mode = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("GrepMode"));
        let has_grep_match = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("GrepMatch"));
        let has_plain_text_matcher = result
            .matches
            .iter()
            .any(|m| m.line_content.contains("PlainTextMatcher"));

        assert!(has_grep_mode, "Should find GrepMode");
        assert!(has_grep_match, "Should find GrepMatch");
        assert!(has_plain_text_matcher, "Should find PlainTextMatcher");

        // File 3 should have no matches
        assert_eq!(result.files.len(), 2, "Should match exactly 2 files");

        // Test with single pattern
        let result2 = super::multi_grep_search(&files, &["PlainTextMatcher"], &[], &options);
        assert_eq!(
            result2.matches.len(),
            1,
            "Single pattern should find 1 match"
        );

        // Test with empty patterns
        let result3 = super::multi_grep_search(&files, &[], &[], &options);
        assert_eq!(
            result3.matches.len(),
            0,
            "Empty patterns should find nothing"
        );
    }
}