use ahash::AHashMap;
use rayon::iter::{IndexedParallelIterator, ParallelIterator};
use rayon::slice::ParallelSlice;
use std::cell::UnsafeCell;
use std::sync::OnceLock;
use std::sync::atomic::{AtomicU16, AtomicUsize, Ordering};
const MAX_BIGRAM_COLUMNS: usize = 5000;
const NO_COLUMN: u16 = u16::MAX;
pub struct BigramIndexBuilder {
lookup: Vec<AtomicU16>,
col_data: OnceLock<UnsafeCell<Box<[u64]>>>,
next_column: AtomicU16,
words: usize,
file_count: usize,
populated: AtomicUsize,
}
unsafe impl Sync for BigramIndexBuilder {}
impl BigramIndexBuilder {
pub fn new(file_count: usize) -> Self {
let words = file_count.div_ceil(64);
let mut lookup = Vec::with_capacity(65536);
lookup.resize_with(65536, || AtomicU16::new(NO_COLUMN));
Self {
lookup,
col_data: OnceLock::new(),
next_column: AtomicU16::new(0),
words,
file_count,
populated: AtomicUsize::new(0),
}
}
#[inline(always)]
fn col_data_cell(&self) -> &UnsafeCell<Box<[u64]>> {
self.col_data.get_or_init(|| {
let total = MAX_BIGRAM_COLUMNS * self.words;
UnsafeCell::new(vec![0u64; total].into_boxed_slice())
})
}
#[inline(always)]
fn col_data_ptr(&self) -> *mut u64 {
unsafe { (*self.col_data_cell().get()).as_mut_ptr() }
}
#[inline]
fn get_or_alloc_column(&self, key: u16) -> u16 {
let current = self.lookup[key as usize].load(Ordering::Relaxed);
if current != NO_COLUMN {
return current;
}
let new_col = self.next_column.fetch_add(1, Ordering::Relaxed);
if new_col >= MAX_BIGRAM_COLUMNS as u16 {
return NO_COLUMN;
}
match self.lookup[key as usize].compare_exchange(
NO_COLUMN,
new_col,
Ordering::Relaxed,
Ordering::Relaxed,
) {
Ok(_) => new_col,
Err(existing) => existing,
}
}
#[inline(always)]
unsafe fn column_word_ptr(&self, col: u16, word_idx: usize) -> *mut u64 {
unsafe {
self.col_data_ptr()
.add(col as usize * self.words + word_idx)
}
}
#[cfg(test)]
fn column_bitset(&self, col: u16) -> &[u64] {
let start = col as usize * self.words;
let slab = unsafe { &*self.col_data_cell().get() };
&slab[start..start + self.words]
}
#[doc(hidden)]
pub fn add_file_content(&self, skip_builder: &Self, file_idx: usize, content: &[u8]) {
if content.len() < 2 {
return;
}
debug_assert!(file_idx < self.file_count);
let word_idx = file_idx / 64;
let bit_mask = 1u64 << (file_idx % 64);
let mut seen_consec = [0u64; 1024];
let mut seen_skip = [0u64; 1024];
let bytes = content;
let len = bytes.len();
let mut n0 = normalize_byte_scalar(bytes[0]);
let mut n1 = normalize_byte_scalar(bytes[1]);
if n0 != u16::MAX && n1 != u16::MAX {
let key = (n0 << 8) | n1;
self.record_bigram(&mut seen_consec, key, word_idx, bit_mask);
}
for &b in &bytes[2..len] {
let cur = normalize_byte_scalar(b);
if cur != u16::MAX {
if n1 != u16::MAX {
let key = (n1 << 8) | cur;
self.record_bigram(&mut seen_consec, key, word_idx, bit_mask);
}
if n0 != u16::MAX {
let key = (n0 << 8) | cur;
skip_builder.record_bigram(&mut seen_skip, key, word_idx, bit_mask);
}
}
n0 = n1;
n1 = cur;
}
self.populated.fetch_add(1, Ordering::Relaxed);
skip_builder.populated.fetch_add(1, Ordering::Relaxed);
}
#[inline(always)]
fn record_bigram(&self, seen: &mut [u64; 1024], key: u16, word_idx: usize, bit_mask: u64) {
let k = key as usize;
let w = k >> 6;
let bit = 1u64 << (k & 63);
if seen[w] & bit == 0 {
seen[w] |= bit;
let col = self.get_or_alloc_column(key);
if col != NO_COLUMN {
unsafe {
let p = self.column_word_ptr(col, word_idx);
*p |= bit_mask;
}
}
}
}
pub fn is_ready(&self) -> bool {
self.populated.load(Ordering::Relaxed) > 0
}
pub fn columns_used(&self) -> u16 {
self.next_column
.load(Ordering::Relaxed)
.min(MAX_BIGRAM_COLUMNS as u16)
}
#[inline(always)]
pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
let cols = self.columns_used() as usize;
let words = self.words;
let file_count = self.file_count;
let populated = self.populated.load(Ordering::Relaxed);
let dense_bytes = words * 8;
let old_lookup = self.lookup;
let col_data: Option<Box<[u64]>> = self.col_data.into_inner().map(UnsafeCell::into_inner);
let mut lookup: Vec<u16> = vec![NO_COLUMN; 65536];
let mut dense_data: Vec<u64> = Vec::with_capacity(cols * words);
let mut dense_count: usize = 0;
if let Some(col_data) = col_data.as_deref() {
for key in 0..65536usize {
let old_col = old_lookup[key].load(Ordering::Relaxed);
if old_col == NO_COLUMN || old_col as usize >= cols {
continue;
}
let col_start = old_col as usize * words;
let bitset = &col_data[col_start..col_start + words];
let mut popcount = 0u32;
for &word in bitset.iter().take(words) {
popcount += word.count_ones();
}
let not_to_rare = if let Some(min_pct) = min_density_pct {
populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
} else {
(popcount as usize * 4) >= dense_bytes
};
if !not_to_rare {
continue;
}
if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
continue;
}
let dense_idx = dense_count as u16;
lookup[key] = dense_idx;
dense_count += 1;
dense_data.extend_from_slice(bitset);
}
}
BigramFilter {
lookup,
dense_data,
dense_count,
words,
file_count,
populated,
skip_index: None,
}
}
}
unsafe impl Send for BigramIndexBuilder {}
#[derive(Debug)]
pub struct BigramFilter {
lookup: Vec<u16>,
dense_data: Vec<u64>, dense_count: usize,
words: usize,
file_count: usize,
populated: usize,
skip_index: Option<Box<BigramFilter>>,
}
#[inline]
fn bitset_and(result: &mut [u64], bitset: &[u64]) {
result
.iter_mut()
.zip(bitset.iter())
.for_each(|(r, b)| *r &= *b);
}
impl BigramFilter {
pub fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
if pattern.len() < 2 {
return None;
}
let mut result = vec![u64::MAX; self.words];
if !self.file_count.is_multiple_of(64) {
let last = self.words - 1;
result[last] = (1u64 << (self.file_count % 64)) - 1;
}
let words = self.words;
let mut has_filter = false;
let mut prev = pattern[0];
for &b in &pattern[1..] {
if (32..=126).contains(&prev) && (32..=126).contains(&b) {
let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
let col = self.lookup[key as usize];
if col != NO_COLUMN {
let offset = col as usize * words;
let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
bitset_and(&mut result, slice);
has_filter = true;
}
}
prev = b;
}
if let Some(skip) = &self.skip_index
&& pattern.len() >= 3
&& let Some(skip_candidates) = skip.query_skip(pattern)
{
bitset_and(&mut result, &skip_candidates);
has_filter = true;
}
has_filter.then_some(result)
}
fn query_skip(&self, pattern: &[u8]) -> Option<Vec<u64>> {
let mut result = vec![u64::MAX; self.words];
if !self.file_count.is_multiple_of(64) {
let last = self.words - 1;
result[last] = (1u64 << (self.file_count % 64)) - 1;
}
let words = self.words;
let mut has_filter = false;
for i in 0..pattern.len().saturating_sub(2) {
let a = pattern[i];
let b = pattern[i + 2];
if (32..=126).contains(&a) && (32..=126).contains(&b) {
let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
let col = self.lookup[key as usize];
if col != NO_COLUMN {
let offset = col as usize * words;
let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
bitset_and(&mut result, slice);
has_filter = true;
}
}
}
has_filter.then_some(result)
}
pub fn set_skip_index(&mut self, skip: BigramFilter) {
self.skip_index = Some(Box::new(skip));
}
#[inline]
pub fn is_candidate(candidates: &[u64], file_idx: usize) -> bool {
let word = file_idx / 64;
let bit = file_idx % 64;
word < candidates.len() && candidates[word] & (1u64 << bit) != 0
}
pub fn count_candidates(candidates: &[u64]) -> usize {
candidates.iter().map(|w| w.count_ones() as usize).sum()
}
pub fn is_ready(&self) -> bool {
self.populated > 0
}
pub fn file_count(&self) -> usize {
self.file_count
}
pub fn columns_used(&self) -> usize {
self.dense_count
}
pub fn heap_bytes(&self) -> usize {
let lookup_bytes = self.lookup.len() * std::mem::size_of::<u16>();
let dense_bytes = self.dense_data.len() * std::mem::size_of::<u64>();
let skip_bytes = self.skip_index.as_ref().map_or(0, |s| s.heap_bytes());
lookup_bytes + dense_bytes + skip_bytes
}
pub fn has_key(&self, key: u16) -> bool {
self.lookup[key as usize] != NO_COLUMN
}
pub fn lookup(&self) -> &[u16] {
&self.lookup
}
pub fn dense_data(&self) -> &[u64] {
&self.dense_data
}
pub fn words(&self) -> usize {
self.words
}
pub fn dense_count(&self) -> usize {
self.dense_count
}
pub fn populated(&self) -> usize {
self.populated
}
pub fn skip_index(&self) -> Option<&BigramFilter> {
self.skip_index.as_deref()
}
pub fn new(
lookup: Vec<u16>,
dense_data: Vec<u64>,
dense_count: usize,
words: usize,
file_count: usize,
populated: usize,
) -> Self {
Self {
lookup,
dense_data,
dense_count,
words,
file_count,
populated,
skip_index: None,
}
}
}
#[inline(always)]
fn normalize_byte_scalar(b: u8) -> u16 {
let printable = b.wrapping_sub(32) <= 94;
let lower = b | ((b.wrapping_sub(b'A') < 26) as u8 * 0x20);
if printable { lower as u16 } else { u16::MAX }
}
pub fn extract_bigrams(content: &[u8]) -> Vec<u16> {
if content.len() < 2 {
return Vec::new();
}
let mut seen = vec![0u64; 1024]; let mut bigrams = Vec::new();
let mut prev = content[0];
for &b in &content[1..] {
if (32..=126).contains(&prev) && (32..=126).contains(&b) {
let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
let word = key as usize / 64;
let bit = 1u64 << (key as usize % 64);
if seen[word] & bit == 0 {
seen[word] |= bit;
bigrams.push(key);
}
}
prev = b;
}
bigrams
}
#[derive(Debug)]
pub struct BigramOverlay {
modified: AHashMap<usize, Vec<u16>>,
tombstones: Vec<u64>,
base_file_count: usize,
}
impl BigramOverlay {
pub(crate) fn new(base_file_count: usize) -> Self {
let words = base_file_count.div_ceil(64);
Self {
modified: AHashMap::new(),
tombstones: vec![0u64; words],
base_file_count,
}
}
pub(crate) fn modify_file(&mut self, file_idx: usize, content: &[u8]) {
self.modified.insert(file_idx, extract_bigrams(content));
}
pub(crate) fn delete_file(&mut self, file_idx: usize) {
if file_idx < self.base_file_count {
let word = file_idx / 64;
self.tombstones[word] |= 1u64 << (file_idx % 64);
}
self.modified.remove(&file_idx);
}
pub(crate) fn query_modified(&self, pattern_bigrams: &[u16]) -> Vec<usize> {
if pattern_bigrams.is_empty() {
return self.modified.keys().copied().collect();
}
self.modified
.iter()
.filter_map(|(&file_idx, bigrams)| {
pattern_bigrams
.iter()
.all(|pb| bigrams.contains(pb))
.then_some(file_idx)
})
.collect()
}
pub(crate) fn base_file_count(&self) -> usize {
self.base_file_count
}
pub(crate) fn tombstones(&self) -> &[u64] {
&self.tombstones
}
pub(crate) fn modified_indices(&self) -> Vec<usize> {
self.modified.keys().copied().collect()
}
}
pub(crate) const MAX_INDEXABLE_FILE_SIZE: usize = 2 * 1024 * 1024;
const BIGRAM_CHUNK_FILES: usize = 4 * 64;
const SKIP_INDEX_MIN_DENSITY_PCT: u32 = 12;
thread_local! {
static READ_BUF: std::cell::RefCell<Box<[u8]>> =
std::cell::RefCell::new(vec![0u8; MAX_INDEXABLE_FILE_SIZE].into_boxed_slice());
}
#[inline]
#[allow(clippy::too_many_arguments)]
fn read_bigram_chunk<'a>(
file: &'a crate::types::FileItem,
base_fd: libc::c_int,
base_path: &std::path::Path,
arena: crate::simd_path::ArenaPtr,
budget: &crate::types::ContentCacheBudget,
warmup: bool,
buf: &'a mut [u8],
path_buf: &mut [u8; crate::simd_path::PATH_BUF_SIZE],
) -> Option<&'a [u8]> {
if warmup
&& !file.is_likely_hot()
&& let Some(cached) = file.get_cached_content(arena, base_path, budget)
{
if crate::file_picker::detect_binary_content(cached) {
file.set_binary(true);
return None;
}
return Some(&cached[..cached.len().min(MAX_INDEXABLE_FILE_SIZE)]);
}
let want = (file.size as usize).min(MAX_INDEXABLE_FILE_SIZE);
let filled = file.read_trimmed_into_buf(base_fd, base_path, arena, path_buf, &mut buf[..want]);
if filled == 0 {
return None;
}
let data = &buf[..filled];
if crate::file_picker::detect_binary_content(data) {
file.set_binary(true);
return None;
}
Some(data)
}
#[tracing::instrument(skip_all, name = "Building Bigram Index", level = tracing::Level::DEBUG)]
pub(crate) fn build_bigram_index(
files: &[crate::types::FileItem],
budget: &crate::types::ContentCacheBudget,
base_path: &std::path::Path,
arena: crate::simd_path::ArenaPtr,
warmup: bool,
) -> BigramFilter {
let builder = BigramIndexBuilder::new(files.len());
let skip_builder = BigramIndexBuilder::new(files.len());
#[cfg(unix)]
let base_fd: libc::c_int = open_base_dir_fd(base_path);
#[cfg(not(unix))]
let base_fd: i32 = -1;
crate::file_picker::BACKGROUND_THREAD_POOL.install(|| {
files
.par_chunks(BIGRAM_CHUNK_FILES)
.enumerate()
.for_each(|(chunk_idx, chunk)| {
let base_idx = chunk_idx * BIGRAM_CHUNK_FILES;
for (offset, file) in chunk.iter().enumerate() {
let file_idx = base_idx + offset;
if file.is_binary() || file.size == 0 {
return;
}
READ_BUF.with(|read_cell| {
let mut buf = read_cell.borrow_mut();
let mut path_buf = [0u8; crate::simd_path::PATH_BUF_SIZE];
if let Some(content) = read_bigram_chunk(
file,
base_fd,
base_path,
arena,
budget,
warmup,
&mut buf[..],
&mut path_buf,
) {
builder.add_file_content(&skip_builder, file_idx, content);
}
});
}
});
});
#[cfg(unix)]
if base_fd >= 0 {
unsafe { libc::close(base_fd) };
}
let mut index = builder.compress(None);
let skip_index = skip_builder.compress(Some(SKIP_INDEX_MIN_DENSITY_PCT));
index.set_skip_index(skip_index);
crate::file_picker::hint_allocator_collect();
index
}
#[cfg(unix)]
fn open_base_dir_fd(base_path: &std::path::Path) -> libc::c_int {
use std::os::unix::ffi::OsStrExt;
let mut cstr = [0u8; crate::simd_path::PATH_BUF_SIZE];
let bytes = base_path.as_os_str().as_bytes();
if bytes.len() >= cstr.len() {
return -1;
}
cstr[..bytes.len()].copy_from_slice(bytes);
unsafe {
libc::open(
cstr.as_ptr() as *const std::os::raw::c_char,
libc::O_RDONLY | libc::O_DIRECTORY,
)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn key(a: u8, b: u8) -> u16 {
((a.to_ascii_lowercase() as u16) << 8) | b.to_ascii_lowercase() as u16
}
fn expected_bigrams(content: &[u8]) -> (Vec<u16>, Vec<u16>) {
let mut consec: std::collections::BTreeSet<u16> = Default::default();
let mut skip: std::collections::BTreeSet<u16> = Default::default();
let printable = |b: u8| (32..=126).contains(&b);
for i in 1..content.len() {
let a = content[i - 1];
let b = content[i];
if printable(a) && printable(b) {
consec.insert(key(a, b));
}
if i >= 2 {
let a = content[i - 2];
let b = content[i];
if printable(a) && printable(b) {
skip.insert(key(a, b));
}
}
}
(consec.into_iter().collect(), skip.into_iter().collect())
}
fn builder_has_key_for_file_0(b: &BigramIndexBuilder, k: u16) -> bool {
let col = b.lookup[k as usize].load(Ordering::Relaxed);
if col == NO_COLUMN {
return false;
}
b.column_bitset(col)[0] & 1 != 0
}
fn run_and_compare(content: &[u8]) {
let consec = BigramIndexBuilder::new(1);
let skip = BigramIndexBuilder::new(1);
consec.add_file_content(&skip, 0, content);
let (expected_consec, expected_skip) = expected_bigrams(content);
for k in &expected_consec {
assert!(
builder_has_key_for_file_0(&consec, *k),
"consec bigram 0x{k:04x} missing for content {content:?}",
);
}
for k in &expected_skip {
assert!(
builder_has_key_for_file_0(&skip, *k),
"skip bigram 0x{k:04x} missing for content {content:?}",
);
}
for k in 0u32..=0xFFFF {
let recorded_consec = builder_has_key_for_file_0(&consec, k as u16);
let recorded_skip = builder_has_key_for_file_0(&skip, k as u16);
if recorded_consec {
assert!(
expected_consec.contains(&(k as u16)),
"unexpected consec bigram 0x{k:04x} in content {content:?}",
);
}
if recorded_skip {
assert!(
expected_skip.contains(&(k as u16)),
"unexpected skip bigram 0x{k:04x} in content {content:?}",
);
}
}
}
#[test]
fn add_file_empty_is_noop() {
let consec = BigramIndexBuilder::new(1);
let skip = BigramIndexBuilder::new(1);
consec.add_file_content(&skip, 0, b"");
assert_eq!(consec.columns_used(), 0);
assert_eq!(skip.columns_used(), 0);
assert_eq!(consec.populated.load(Ordering::Relaxed), 0);
}
#[test]
fn add_file_single_byte_is_noop() {
let consec = BigramIndexBuilder::new(1);
let skip = BigramIndexBuilder::new(1);
consec.add_file_content(&skip, 0, b"a");
assert_eq!(consec.columns_used(), 0);
assert_eq!(skip.columns_used(), 0);
}
#[test]
fn add_file_two_bytes_consec_only() {
run_and_compare(b"ab");
}
#[test]
fn add_file_three_bytes_has_skip() {
run_and_compare(b"abc");
}
#[test]
fn add_file_ascii_words() {
run_and_compare(b"hello world");
run_and_compare(b"the quick brown fox jumps over the lazy dog");
run_and_compare(b"fn main() { println!(\"hi\"); }");
}
#[test]
fn add_file_case_is_lowered() {
let upper = BigramIndexBuilder::new(1);
let upper_skip = BigramIndexBuilder::new(1);
upper.add_file_content(&upper_skip, 0, b"ABC");
let lower = BigramIndexBuilder::new(1);
let lower_skip = BigramIndexBuilder::new(1);
lower.add_file_content(&lower_skip, 0, b"abc");
for k in 0u32..=0xFFFF {
let u = builder_has_key_for_file_0(&upper, k as u16);
let l = builder_has_key_for_file_0(&lower, k as u16);
assert_eq!(u, l, "consec 0x{k:04x}: upper={u} lower={l}");
let u = builder_has_key_for_file_0(&upper_skip, k as u16);
let l = builder_has_key_for_file_0(&lower_skip, k as u16);
assert_eq!(u, l, "skip 0x{k:04x}: upper={u} lower={l}");
}
}
#[test]
fn add_file_rejects_non_printable() {
run_and_compare(b"\0a\0b");
let consec = BigramIndexBuilder::new(1);
let skip = BigramIndexBuilder::new(1);
consec.add_file_content(&skip, 0, b"\0\0\0\0");
assert_eq!(consec.columns_used(), 0);
assert_eq!(skip.columns_used(), 0);
}
#[test]
fn add_file_mixed_printable_and_control() {
run_and_compare(b"a\tb\nc d");
}
#[test]
fn add_file_repeats_are_deduped() {
run_and_compare(b"ababababab");
}
#[test]
fn add_file_tombstone_separation() {
let consec = BigramIndexBuilder::new(2);
let skip = BigramIndexBuilder::new(2);
consec.add_file_content(&skip, 0, b"xy");
consec.add_file_content(&skip, 1, b"zw");
let key_xy = key(b'x', b'y');
let key_zw = key(b'z', b'w');
let col_xy = consec.lookup[key_xy as usize].load(Ordering::Relaxed);
let col_zw = consec.lookup[key_zw as usize].load(Ordering::Relaxed);
let bitset_xy = consec.column_bitset(col_xy)[0];
let bitset_zw = consec.column_bitset(col_zw)[0];
assert_eq!(bitset_xy & 0b01, 0b01, "file 0 should have xy");
assert_eq!(bitset_zw & 0b01, 0, "file 0 should NOT have zw");
assert_eq!(bitset_xy & 0b10, 0, "file 1 should NOT have xy");
assert_eq!(bitset_zw & 0b10, 0b10, "file 1 should have zw");
}
#[test]
fn add_file_long_content() {
let mut buf = Vec::with_capacity(8192);
for i in 0..8192 {
buf.push(32u8 + ((i * 7) % 95) as u8); }
run_and_compare(&buf);
}
#[test]
fn add_file_simd_and_scalar_agree() {
let mut mixed = Vec::with_capacity(256);
for i in 0..256usize {
mixed.push(match i % 9 {
0 => 0, 1 => 0x7F, 2 => b'\n', _ => 32 + ((i * 13) % 95) as u8,
});
}
run_and_compare(&mixed[..127]); run_and_compare(&mixed); run_and_compare(&mixed[..192]); }
#[test]
fn add_file_respects_file_count_boundary() {
let consec = BigramIndexBuilder::new(100);
let skip = BigramIndexBuilder::new(100);
consec.add_file_content(&skip, 63, b"ab");
consec.add_file_content(&skip, 64, b"cd");
let kab = key(b'a', b'b');
let kcd = key(b'c', b'd');
let col_ab = consec.lookup[kab as usize].load(Ordering::Relaxed);
let col_cd = consec.lookup[kcd as usize].load(Ordering::Relaxed);
let ab_bitset = consec.column_bitset(col_ab);
let cd_bitset = consec.column_bitset(col_cd);
assert_eq!(ab_bitset[0], 1u64 << 63);
assert_eq!(ab_bitset[1], 0);
assert_eq!(cd_bitset[0], 0);
assert_eq!(cd_bitset[1], 1);
}
}