//! fff_search — bigram_filter.rs
//!
//! Inverted bigram index used as a fast pre-filter for content search:
//! a parallel dense builder, a compressed query-time filter (with an
//! optional skip-1 extension), and an incremental overlay for file events.
1use std::sync::atomic::{AtomicU16, AtomicU64, AtomicUsize, Ordering};
2
3use ahash::AHashMap;
4
/// Maximum number of distinct bigrams tracked in the inverted index.
/// 95 printable ASCII chars (32..=126) collapse to ~69 distinct after
/// lowercasing, giving roughly 69² ≈ 4761 possible pairs.
/// We cap at 5000 to cover all printable bigrams with margin.
/// Memory: 5000 columns × 62.5KB/column (500k files) ≈ 305MB; 50k files: ~30MB.
const MAX_BIGRAM_COLUMNS: usize = 5000;

/// Sentinel value: bigram has no allocated column.
/// u16::MAX (65535) can never collide with a real column id (< 5000).
const NO_COLUMN: u16 = u16::MAX;
13
/// Temporary sync dense builder for the bigram index.
/// Filled concurrently by the many threads reading file contents in parallel,
/// then consumed by `compress()` into a compact `BigramFilter`.
pub struct BigramIndexBuilder {
    // we use lookup as atomics only in the builder because it is filled by the rayon threads
    // the actual index uses pure u16 for the allocations
    /// 65536-entry map: bigram key → column id, or NO_COLUMN if unallocated.
    lookup: Vec<AtomicU16>,
    /// Per-column bitset data (one bit per file), eagerly zero-allocated in
    /// `new()` as a single MAX_BIGRAM_COLUMNS × words flat buffer.
    /// NOTE(review): the old "lazily allocated via OnceLock" comment was
    /// stale — `new()` fills the whole buffer up front.
    col_data: Vec<AtomicU64>,
    /// Next free column id, monotonically increasing.
    next_column: AtomicU16,
    /// u64 words per column (= ceil(file_count / 64)).
    words: usize,
    /// Total number of files the bitsets are sized for.
    file_count: usize,
    /// Count of files whose content has been indexed so far.
    populated: AtomicUsize,
}
27
28impl BigramIndexBuilder {
29    pub fn new(file_count: usize) -> Self {
30        let words = file_count.div_ceil(64);
31        let mut lookup = Vec::with_capacity(65536);
32        lookup.resize_with(65536, || AtomicU16::new(NO_COLUMN));
33        let mut col_data = Vec::with_capacity(MAX_BIGRAM_COLUMNS * words);
34        col_data.resize_with(MAX_BIGRAM_COLUMNS * words, || AtomicU64::new(0));
35        Self {
36            lookup,
37            col_data,
38            next_column: AtomicU16::new(0),
39            words,
40            file_count,
41            populated: AtomicUsize::new(0),
42        }
43    }
44
45    #[inline]
46    fn get_or_alloc_column(&self, key: u16) -> u16 {
47        let current = self.lookup[key as usize].load(Ordering::Relaxed);
48        if current != NO_COLUMN {
49            return current;
50        }
51        let new_col = self.next_column.fetch_add(1, Ordering::Relaxed);
52        if new_col >= MAX_BIGRAM_COLUMNS as u16 {
53            return NO_COLUMN;
54        }
55
56        match self.lookup[key as usize].compare_exchange(
57            NO_COLUMN,
58            new_col,
59            Ordering::Relaxed,
60            Ordering::Relaxed,
61        ) {
62            Ok(_) => new_col,
63            Err(existing) => existing,
64        }
65    }
66
67    #[inline]
68    fn column_bitset(&self, col: u16) -> &[AtomicU64] {
69        let start = col as usize * self.words;
70        &self.col_data[start..start + self.words]
71    }
72
73    pub(crate) fn add_file_content(&self, skip_builder: &Self, file_idx: usize, content: &[u8]) {
74        if content.len() < 2 {
75            return;
76        }
77
78        debug_assert!(file_idx < self.file_count);
79        let word_idx = file_idx / 64;
80        let bit_mask = 1u64 << (file_idx % 64);
81
82        // Stack-local dedup bitsets: 1024 × u64 = 8 KB each, covers all 65536 bigrams with margin
83        // have to fit in L1 cache
84        let mut seen_consec = [0u64; 1024];
85        let mut seen_skip = [0u64; 1024];
86
87        let bytes = content;
88        let len = bytes.len();
89
90        // First consecutive pair (no skip bigram possible yet).
91        let (a, b) = (bytes[0], bytes[1]);
92        if (32..=126).contains(&a) && (32..=126).contains(&b) {
93            let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
94            let w = key as usize >> 6;
95            let bit = 1u64 << (key as usize & 63);
96            seen_consec[w] |= bit;
97            let col = self.get_or_alloc_column(key);
98            if col != NO_COLUMN {
99                self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
100            }
101        }
102
103        // Main loop: consecutive (i-1, i) and skip-1 (i-2, i)
104        for i in 2..len {
105            let cur = bytes[i];
106
107            // Consecutive bigram: (bytes[i-1], bytes[i])
108            let prev = bytes[i - 1];
109            if (32..=126).contains(&prev) && (32..=126).contains(&cur) {
110                let key = (prev.to_ascii_lowercase() as u16) << 8 | cur.to_ascii_lowercase() as u16;
111                let w = key as usize >> 6;
112                let bit = 1u64 << (key as usize & 63);
113                if seen_consec[w] & bit == 0 {
114                    seen_consec[w] |= bit;
115                    let col = self.get_or_alloc_column(key);
116                    if col != NO_COLUMN {
117                        self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
118                    }
119                }
120            }
121
122            // Skip-1 bigram: (bytes[i-2], bytes[i])
123            let skip_prev = bytes[i - 2];
124            if (32..=126).contains(&skip_prev) && (32..=126).contains(&cur) {
125                let key =
126                    (skip_prev.to_ascii_lowercase() as u16) << 8 | cur.to_ascii_lowercase() as u16;
127                let w = key as usize >> 6;
128                let bit = 1u64 << (key as usize & 63);
129                if seen_skip[w] & bit == 0 {
130                    seen_skip[w] |= bit;
131                    let col = skip_builder.get_or_alloc_column(key);
132                    if col != NO_COLUMN {
133                        skip_builder.column_bitset(col)[word_idx]
134                            .fetch_or(bit_mask, Ordering::Relaxed);
135                    }
136                }
137            }
138        }
139
140        self.populated.fetch_add(1, Ordering::Relaxed);
141        skip_builder.populated.fetch_add(1, Ordering::Relaxed);
142    }
143
144    pub fn is_ready(&self) -> bool {
145        self.populated.load(Ordering::Relaxed) > 0
146    }
147
148    pub fn columns_used(&self) -> u16 {
149        self.next_column
150            .load(Ordering::Relaxed)
151            .min(MAX_BIGRAM_COLUMNS as u16)
152    }
153
154    /// Compress the dense builder into a compact `BigramFilter`.
155    ///
156    /// Retains columns where the bigram appears in ≥`min_density_pct`% (or
157    /// the default ~3.1% heuristic when `None`) and <90% of indexed files.
158    /// Sparse columns carry too little data to justify their memory;
159    /// ubiquitous columns (≥90%) are nearly all-ones and barely filter.
160    pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
161        let cols = self.columns_used() as usize;
162        let words = self.words;
163        let file_count = self.file_count;
164        let populated = self.populated.load(Ordering::Relaxed);
165        let dense_bytes = words * 8; // cost of one dense column
166
167        let old_lookup = self.lookup;
168        let col_data = self.col_data;
169
170        let mut lookup: Vec<u16> = vec![NO_COLUMN; 65536];
171        let mut dense_data: Vec<u64> = Vec::with_capacity(cols * words);
172        let mut dense_count: usize = 0;
173
174        for key in 0..65536usize {
175            let old_col = old_lookup[key].load(Ordering::Relaxed);
176            if old_col == NO_COLUMN || old_col as usize >= cols {
177                continue;
178            }
179
180            let col_start = old_col as usize * words;
181            let bitset = &col_data[col_start..col_start + words];
182
183            // count set bits to decide if this column is worth keeping.
184            let mut popcount = 0u32;
185            for column in bitset.iter().take(words) {
186                popcount += column.load(Ordering::Relaxed).count_ones();
187            }
188
189            // drop bigrams appearing in too few files
190            let not_to_rare = if let Some(min_pct) = min_density_pct {
191                // Percentage-based: require ≥ min_pct% of populated files.
192                populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
193            } else {
194                // Default: popcount ≥ words × 2 (~3.1% of files).
195                (popcount as usize * 4) >= dense_bytes
196            };
197
198            if !not_to_rare {
199                continue;
200            }
201
202            // Drop ubiquitous bigrams — columns ≥90% ones carry almost no
203            // filtering power and just waste memory + AND cycles.
204            if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
205                continue;
206            }
207
208            let dense_idx = dense_count as u16;
209            lookup[key] = dense_idx;
210            dense_count += 1;
211
212            for column in bitset.iter().take(words) {
213                dense_data.push(column.load(Ordering::Relaxed));
214            }
215        }
216
217        // col_data + old_lookup dropped here — single deallocation each,
218        // no fragmentation.
219
220        BigramFilter {
221            lookup,
222            dense_data,
223            dense_count,
224            words,
225            file_count,
226            populated,
227            skip_index: None,
228        }
229    }
230}
231
// No `unsafe impl Send`/`Sync` needed for `BigramIndexBuilder`: every field
// (Vec<AtomicU16>, Vec<AtomicU64>, AtomicU16, AtomicUsize, usize) is already
// Send + Sync, so the auto traits are derived by the compiler. Hand-written
// unsafe impls here would only suppress the compiler's checking if a
// non-Sync field were ever added.
234
/// Inverted bigram index with optional "skip-1" extension.
/// Compressed into flat bitsets for minimal memory usage; the field layout
/// of this struct actually matters.
#[derive(Debug)]
pub struct BigramFilter {
    /// 65536-entry map: bigram key → dense column index, NO_COLUMN if absent.
    lookup: Vec<u16>,
    /// Flat buffer of all dense column data laid out at fixed stride `words`.
    /// Column `i` starts at `i * words`.
    dense_data: Vec<u64>, // do not try to change this to u8 it has to be wordsize
    /// Number of dense columns retained after compression.
    dense_count: usize,
    /// u64 words per column (= ceil(file_count / 64)).
    words: usize,
    /// Total number of files the bitsets are sized for.
    file_count: usize,
    /// Number of files that contributed content when the index was built.
    populated: usize,
    /// Optional skip-1 bigram index (stride 2). Built from character pairs
    /// at distance 2, e.g. "ABCDE" → (A,C),(B,D),(C,E). ANDed with the
    /// consecutive bigram candidates during query to dramatically reduce
    /// false positives.
    skip_index: Option<Box<BigramFilter>>,
}
253
/// SIMD-friendly bitwise AND of two equal-length bitsets, folded in place
/// into `result`. If the slices differ in length, only the common prefix is
/// ANDed (the zip stops at the shorter slice).
// Auto vectorized (don't touch) — the iterator form compiles without
// per-element bounds checks.
#[inline]
fn bitset_and(result: &mut [u64], bitset: &[u64]) {
    result
        .iter_mut()
        .zip(bitset.iter())
        .for_each(|(r, b)| *r &= *b);
}
263
264impl BigramFilter {
265    /// AND the posting lists for all query bigrams (consecutive + skip).
266    /// Returns None if no query bigrams are tracked.
267    pub fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
268        if pattern.len() < 2 {
269            return None;
270        }
271
272        let mut result = vec![u64::MAX; self.words];
273        if !self.file_count.is_multiple_of(64) {
274            let last = self.words - 1;
275            result[last] = (1u64 << (self.file_count % 64)) - 1;
276        }
277
278        let words = self.words;
279        let mut has_filter = false;
280
281        let mut prev = pattern[0];
282        for &b in &pattern[1..] {
283            if (32..=126).contains(&prev) && (32..=126).contains(&b) {
284                let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
285                let col = self.lookup[key as usize];
286                if col != NO_COLUMN {
287                    let offset = col as usize * words;
288                    // SAFETY: compress() guarantees offset + words <= dense_data.len()
289                    let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
290                    bitset_and(&mut result, slice);
291                    has_filter = true;
292                }
293            }
294            prev = b;
295        }
296
297        // strid-1 bigrams
298        if let Some(skip) = &self.skip_index
299            && pattern.len() >= 3
300            && let Some(skip_candidates) = skip.query_skip(pattern)
301        {
302            bitset_and(&mut result, &skip_candidates);
303            has_filter = true;
304        }
305
306        has_filter.then_some(result)
307    }
308
309    /// Query using stride-2 bigrams from the pattern.
310    /// For "ABCDE" queries with keys (A,C), (B,D), (C,E).
311    fn query_skip(&self, pattern: &[u8]) -> Option<Vec<u64>> {
312        let mut result = vec![u64::MAX; self.words];
313        if !self.file_count.is_multiple_of(64) {
314            let last = self.words - 1;
315            result[last] = (1u64 << (self.file_count % 64)) - 1;
316        }
317
318        let words = self.words;
319        let mut has_filter = false;
320
321        for i in 0..pattern.len().saturating_sub(2) {
322            let a = pattern[i];
323            let b = pattern[i + 2];
324            if (32..=126).contains(&a) && (32..=126).contains(&b) {
325                let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
326                let col = self.lookup[key as usize];
327                if col != NO_COLUMN {
328                    let offset = col as usize * words;
329                    let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
330                    bitset_and(&mut result, slice);
331                    has_filter = true;
332                }
333            }
334        }
335
336        has_filter.then_some(result)
337    }
338
339    /// Attach a skip-1 bigram index for tighter candidate filtering.
340    pub fn set_skip_index(&mut self, skip: BigramFilter) {
341        self.skip_index = Some(Box::new(skip));
342    }
343
344    #[inline]
345    pub fn is_candidate(candidates: &[u64], file_idx: usize) -> bool {
346        let word = file_idx / 64;
347        let bit = file_idx % 64;
348        word < candidates.len() && candidates[word] & (1u64 << bit) != 0
349    }
350
351    pub fn count_candidates(candidates: &[u64]) -> usize {
352        candidates.iter().map(|w| w.count_ones() as usize).sum()
353    }
354
355    pub fn is_ready(&self) -> bool {
356        self.populated > 0
357    }
358
359    pub fn file_count(&self) -> usize {
360        self.file_count
361    }
362
363    pub fn columns_used(&self) -> usize {
364        self.dense_count
365    }
366
367    /// Total heap bytes used by this index (lookup + dense data + skip).
368    pub fn heap_bytes(&self) -> usize {
369        let lookup_bytes = self.lookup.len() * std::mem::size_of::<u16>();
370        let dense_bytes = self.dense_data.len() * std::mem::size_of::<u64>();
371        let skip_bytes = self.skip_index.as_ref().map_or(0, |s| s.heap_bytes());
372        lookup_bytes + dense_bytes + skip_bytes
373    }
374
375    /// Check whether a bigram key is present in this index.
376    pub fn has_key(&self, key: u16) -> bool {
377        self.lookup[key as usize] != NO_COLUMN
378    }
379
380    /// Raw lookup table (65536 entries mapping bigram key → column index).
381    pub fn lookup(&self) -> &[u16] {
382        &self.lookup
383    }
384
385    /// Flat dense bitset data at fixed stride `words`.
386    pub fn dense_data(&self) -> &[u64] {
387        &self.dense_data
388    }
389
390    /// Number of u64 words per column (= ceil(file_count / 64)).
391    pub fn words(&self) -> usize {
392        self.words
393    }
394
395    /// Number of dense columns retained after compression.
396    pub fn dense_count(&self) -> usize {
397        self.dense_count
398    }
399
400    /// Number of files that contributed content to the index.
401    pub fn populated(&self) -> usize {
402        self.populated
403    }
404
405    /// Reference to the optional skip-1 bigram sub-index.
406    pub fn skip_index(&self) -> Option<&BigramFilter> {
407        self.skip_index.as_deref()
408    }
409
410    /// Create a new bigram filter from the internal data
411    pub fn new(
412        lookup: Vec<u16>,
413        dense_data: Vec<u64>,
414        dense_count: usize,
415        words: usize,
416        file_count: usize,
417        populated: usize,
418    ) -> Self {
419        Self {
420            lookup,
421            dense_data,
422            dense_count,
423            words,
424            file_count,
425            populated,
426            skip_index: None,
427        }
428    }
429}
430
/// Extract the deduplicated, case-folded printable-ASCII bigram keys from
/// `content`, in first-occurrence order.
///
/// A key packs the lowercased first byte in the high 8 bits and the
/// lowercased second byte in the low 8 bits. Pairs containing any byte
/// outside the printable range 32..=126 are skipped entirely.
pub fn extract_bigrams(content: &[u8]) -> Vec<u16> {
    if content.len() < 2 {
        return Vec::new();
    }
    // Flat 65536-bit dedup bitset — faster than a HashSet. Kept on the
    // stack (8 KB) to match `BigramIndexBuilder::add_file_content`; the
    // previous `vec![0u64; 1024]` paid a heap allocation on every call.
    let mut seen = [0u64; 1024]; // 1024 * 64 = 65536 bits
    let mut bigrams = Vec::new();

    let mut prev = content[0];
    for &b in &content[1..] {
        if (32..=126).contains(&prev) && (32..=126).contains(&b) {
            let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
            let word = key as usize / 64;
            let bit = 1u64 << (key as usize % 64);
            if seen[word] & bit == 0 {
                seen[word] |= bit;
                bigrams.push(key);
            }
        }
        prev = b;
    }
    bigrams
}
454
/// Modified and added files store their own bigram sets. Deleted files are
/// tombstoned in a bitset so they can be excluded from base query results.
/// This overlay is updated by the background watcher on every file event
/// and cleared when the base index is rebuilt.
#[derive(Debug)]
pub struct BigramOverlay {
    /// Per-file bigram sets for files modified since the base was built.
    /// Key = file index in the base `Vec<FileItem>`.
    modified: AHashMap<usize, Vec<u16>>,

    /// Tombstone bitset — one bit per base file. Set bits are excluded
    /// from base query results. Never cleared for the overlay's lifetime
    /// (a recreated file is served from `modified` instead).
    tombstones: Vec<u64>,

    /// Original files count this overlay was created for; indices at or
    /// beyond this cannot be tombstoned (they have no base bit).
    base_file_count: usize,
}
472
473impl BigramOverlay {
474    pub(crate) fn new(base_file_count: usize) -> Self {
475        let words = base_file_count.div_ceil(64);
476        Self {
477            modified: AHashMap::new(),
478            tombstones: vec![0u64; words],
479            base_file_count,
480        }
481    }
482
483    pub(crate) fn modify_file(&mut self, file_idx: usize, content: &[u8]) {
484        self.modified.insert(file_idx, extract_bigrams(content));
485    }
486
487    pub(crate) fn delete_file(&mut self, file_idx: usize) {
488        if file_idx < self.base_file_count {
489            let word = file_idx / 64;
490            self.tombstones[word] |= 1u64 << (file_idx % 64);
491        }
492        self.modified.remove(&file_idx);
493    }
494
495    /// Return base file indices of modified files whose bigrams match ALL
496    /// of the given `pattern_bigrams`.
497    pub(crate) fn query_modified(&self, pattern_bigrams: &[u16]) -> Vec<usize> {
498        if pattern_bigrams.is_empty() {
499            return self.modified.keys().copied().collect();
500        }
501        self.modified
502            .iter()
503            .filter_map(|(&file_idx, bigrams)| {
504                pattern_bigrams
505                    .iter()
506                    .all(|pb| bigrams.contains(pb))
507                    .then_some(file_idx)
508            })
509            .collect()
510    }
511
512    /// Number of base files this overlay was created for.
513    pub(crate) fn base_file_count(&self) -> usize {
514        self.base_file_count
515    }
516
517    /// Get the tombstone bitset for clearing base candidates.
518    pub(crate) fn tombstones(&self) -> &[u64] {
519        &self.tombstones
520    }
521
522    /// Get all modified file indices (for conservative overlay merging when
523    /// we can't extract precise bigrams, e.g. regex patterns).
524    pub(crate) fn modified_indices(&self) -> Vec<usize> {
525        self.modified.keys().copied().collect()
526    }
527}