1use std::sync::atomic::{AtomicU16, AtomicU64, AtomicUsize, Ordering};
2
3use ahash::AHashMap;
4
/// Upper bound on how many distinct bigram keys receive their own
/// file-presence bitset column; keys requested after the budget is spent
/// are simply not indexed (queries skip them).
const MAX_BIGRAM_COLUMNS: usize = 5000;

/// Sentinel stored in the key lookup table meaning "no column allocated
/// for this bigram key"; also returned when the column budget is exhausted.
const NO_COLUMN: u16 = u16::MAX;
/// Lock-free, concurrently populated builder for a bigram -> file-presence
/// index. Many threads call `add_file_content`; `compress` then turns the
/// atomic working set into an immutable `BigramFilter`.
pub struct BigramIndexBuilder {
    // 65536-entry table: bigram key -> column index, or NO_COLUMN if none
    // has been allocated for that key yet.
    lookup: Vec<AtomicU16>,
    // Flat MAX_BIGRAM_COLUMNS * words arena of per-column file bitsets.
    col_data: Vec<AtomicU64>,
    // Next free column index (monotonically increasing).
    next_column: AtomicU16,
    // Number of u64 words needed to hold one bit per file.
    words: usize,
    // Total number of files the bitsets cover.
    file_count: usize,
    // Count of files whose content has been indexed so far.
    populated: AtomicUsize,
}
27
28impl BigramIndexBuilder {
29 pub fn new(file_count: usize) -> Self {
30 let words = file_count.div_ceil(64);
31 let mut lookup = Vec::with_capacity(65536);
32 lookup.resize_with(65536, || AtomicU16::new(NO_COLUMN));
33 let mut col_data = Vec::with_capacity(MAX_BIGRAM_COLUMNS * words);
34 col_data.resize_with(MAX_BIGRAM_COLUMNS * words, || AtomicU64::new(0));
35 Self {
36 lookup,
37 col_data,
38 next_column: AtomicU16::new(0),
39 words,
40 file_count,
41 populated: AtomicUsize::new(0),
42 }
43 }
44
45 #[inline]
46 fn get_or_alloc_column(&self, key: u16) -> u16 {
47 let current = self.lookup[key as usize].load(Ordering::Relaxed);
48 if current != NO_COLUMN {
49 return current;
50 }
51 let new_col = self.next_column.fetch_add(1, Ordering::Relaxed);
52 if new_col >= MAX_BIGRAM_COLUMNS as u16 {
53 return NO_COLUMN;
54 }
55
56 match self.lookup[key as usize].compare_exchange(
57 NO_COLUMN,
58 new_col,
59 Ordering::Relaxed,
60 Ordering::Relaxed,
61 ) {
62 Ok(_) => new_col,
63 Err(existing) => existing,
64 }
65 }
66
67 #[inline]
68 fn column_bitset(&self, col: u16) -> &[AtomicU64] {
69 let start = col as usize * self.words;
70 &self.col_data[start..start + self.words]
71 }
72
73 pub(crate) fn add_file_content(&self, skip_builder: &Self, file_idx: usize, content: &[u8]) {
74 if content.len() < 2 {
75 return;
76 }
77
78 debug_assert!(file_idx < self.file_count);
79 let word_idx = file_idx / 64;
80 let bit_mask = 1u64 << (file_idx % 64);
81
82 let mut seen_consec = [0u64; 1024];
85 let mut seen_skip = [0u64; 1024];
86
87 let bytes = content;
88 let len = bytes.len();
89
90 let (a, b) = (bytes[0], bytes[1]);
92 if (32..=126).contains(&a) && (32..=126).contains(&b) {
93 let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
94 let w = key as usize >> 6;
95 let bit = 1u64 << (key as usize & 63);
96 seen_consec[w] |= bit;
97 let col = self.get_or_alloc_column(key);
98 if col != NO_COLUMN {
99 self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
100 }
101 }
102
103 for i in 2..len {
105 let cur = bytes[i];
106
107 let prev = bytes[i - 1];
109 if (32..=126).contains(&prev) && (32..=126).contains(&cur) {
110 let key = (prev.to_ascii_lowercase() as u16) << 8 | cur.to_ascii_lowercase() as u16;
111 let w = key as usize >> 6;
112 let bit = 1u64 << (key as usize & 63);
113 if seen_consec[w] & bit == 0 {
114 seen_consec[w] |= bit;
115 let col = self.get_or_alloc_column(key);
116 if col != NO_COLUMN {
117 self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
118 }
119 }
120 }
121
122 let skip_prev = bytes[i - 2];
124 if (32..=126).contains(&skip_prev) && (32..=126).contains(&cur) {
125 let key =
126 (skip_prev.to_ascii_lowercase() as u16) << 8 | cur.to_ascii_lowercase() as u16;
127 let w = key as usize >> 6;
128 let bit = 1u64 << (key as usize & 63);
129 if seen_skip[w] & bit == 0 {
130 seen_skip[w] |= bit;
131 let col = skip_builder.get_or_alloc_column(key);
132 if col != NO_COLUMN {
133 skip_builder.column_bitset(col)[word_idx]
134 .fetch_or(bit_mask, Ordering::Relaxed);
135 }
136 }
137 }
138 }
139
140 self.populated.fetch_add(1, Ordering::Relaxed);
141 skip_builder.populated.fetch_add(1, Ordering::Relaxed);
142 }
143
144 pub fn is_ready(&self) -> bool {
145 self.populated.load(Ordering::Relaxed) > 0
146 }
147
148 pub fn columns_used(&self) -> u16 {
149 self.next_column
150 .load(Ordering::Relaxed)
151 .min(MAX_BIGRAM_COLUMNS as u16)
152 }
153
154 pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
161 let cols = self.columns_used() as usize;
162 let words = self.words;
163 let file_count = self.file_count;
164 let populated = self.populated.load(Ordering::Relaxed);
165 let dense_bytes = words * 8; let old_lookup = self.lookup;
168 let col_data = self.col_data;
169
170 let mut lookup: Vec<u16> = vec![NO_COLUMN; 65536];
171 let mut dense_data: Vec<u64> = Vec::with_capacity(cols * words);
172 let mut dense_count: usize = 0;
173
174 for key in 0..65536usize {
175 let old_col = old_lookup[key].load(Ordering::Relaxed);
176 if old_col == NO_COLUMN || old_col as usize >= cols {
177 continue;
178 }
179
180 let col_start = old_col as usize * words;
181 let bitset = &col_data[col_start..col_start + words];
182
183 let mut popcount = 0u32;
185 for column in bitset.iter().take(words) {
186 popcount += column.load(Ordering::Relaxed).count_ones();
187 }
188
189 let not_to_rare = if let Some(min_pct) = min_density_pct {
191 populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
193 } else {
194 (popcount as usize * 4) >= dense_bytes
196 };
197
198 if !not_to_rare {
199 continue;
200 }
201
202 if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
205 continue;
206 }
207
208 let dense_idx = dense_count as u16;
209 lookup[key] = dense_idx;
210 dense_count += 1;
211
212 for column in bitset.iter().take(words) {
213 dense_data.push(column.load(Ordering::Relaxed));
214 }
215 }
216
217 BigramFilter {
221 lookup,
222 dense_data,
223 dense_count,
224 words,
225 file_count,
226 populated,
227 skip_index: None,
228 }
229 }
230}
231
// SAFETY: every field of BigramIndexBuilder is either a plain integer or a
// Vec of atomic types, all of which are already Send + Sync, so these impls
// assert nothing beyond what the compiler derives automatically.
// NOTE(review): they appear redundant and could likely be removed — confirm
// no non-Sync field is planned before doing so.
unsafe impl Send for BigramIndexBuilder {}
unsafe impl Sync for BigramIndexBuilder {}
234
/// Immutable, query-time bigram presence filter produced by
/// `BigramIndexBuilder::compress`.
#[derive(Debug)]
pub struct BigramFilter {
    // 65536-entry table: bigram key -> dense column index, or NO_COLUMN
    // when the key was dropped during compression.
    lookup: Vec<u16>,
    // Flat dense_count * words array of file-presence bitsets.
    dense_data: Vec<u64>,
    // Number of columns kept after compression.
    dense_count: usize,
    // u64 words per column (one bit per file).
    words: usize,
    // Total files covered by each column bitset.
    file_count: usize,
    // Files that had been indexed when this filter was built.
    populated: usize,
    // Optional companion filter over skip-one bigrams (bytes i and i + 2).
    skip_index: Option<Box<BigramFilter>>,
}
253
/// In-place intersection: clears every bit of `result` that is not also set
/// in the corresponding word of `bitset`. Words beyond the shorter slice
/// are left untouched.
#[inline]
fn bitset_and(result: &mut [u64], bitset: &[u64]) {
    let overlap = result.len().min(bitset.len());
    for idx in 0..overlap {
        result[idx] &= bitset[idx];
    }
}
263
264impl BigramFilter {
265 pub fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
268 if pattern.len() < 2 {
269 return None;
270 }
271
272 let mut result = vec![u64::MAX; self.words];
273 if !self.file_count.is_multiple_of(64) {
274 let last = self.words - 1;
275 result[last] = (1u64 << (self.file_count % 64)) - 1;
276 }
277
278 let words = self.words;
279 let mut has_filter = false;
280
281 let mut prev = pattern[0];
282 for &b in &pattern[1..] {
283 if (32..=126).contains(&prev) && (32..=126).contains(&b) {
284 let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
285 let col = self.lookup[key as usize];
286 if col != NO_COLUMN {
287 let offset = col as usize * words;
288 let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
290 bitset_and(&mut result, slice);
291 has_filter = true;
292 }
293 }
294 prev = b;
295 }
296
297 if let Some(skip) = &self.skip_index
299 && pattern.len() >= 3
300 && let Some(skip_candidates) = skip.query_skip(pattern)
301 {
302 bitset_and(&mut result, &skip_candidates);
303 has_filter = true;
304 }
305
306 has_filter.then_some(result)
307 }
308
309 fn query_skip(&self, pattern: &[u8]) -> Option<Vec<u64>> {
312 let mut result = vec![u64::MAX; self.words];
313 if !self.file_count.is_multiple_of(64) {
314 let last = self.words - 1;
315 result[last] = (1u64 << (self.file_count % 64)) - 1;
316 }
317
318 let words = self.words;
319 let mut has_filter = false;
320
321 for i in 0..pattern.len().saturating_sub(2) {
322 let a = pattern[i];
323 let b = pattern[i + 2];
324 if (32..=126).contains(&a) && (32..=126).contains(&b) {
325 let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
326 let col = self.lookup[key as usize];
327 if col != NO_COLUMN {
328 let offset = col as usize * words;
329 let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
330 bitset_and(&mut result, slice);
331 has_filter = true;
332 }
333 }
334 }
335
336 has_filter.then_some(result)
337 }
338
339 pub fn set_skip_index(&mut self, skip: BigramFilter) {
341 self.skip_index = Some(Box::new(skip));
342 }
343
344 #[inline]
345 pub fn is_candidate(candidates: &[u64], file_idx: usize) -> bool {
346 let word = file_idx / 64;
347 let bit = file_idx % 64;
348 word < candidates.len() && candidates[word] & (1u64 << bit) != 0
349 }
350
351 pub fn count_candidates(candidates: &[u64]) -> usize {
352 candidates.iter().map(|w| w.count_ones() as usize).sum()
353 }
354
355 pub fn is_ready(&self) -> bool {
356 self.populated > 0
357 }
358
359 pub fn file_count(&self) -> usize {
360 self.file_count
361 }
362
363 pub fn columns_used(&self) -> usize {
364 self.dense_count
365 }
366
367 pub fn heap_bytes(&self) -> usize {
369 let lookup_bytes = self.lookup.len() * std::mem::size_of::<u16>();
370 let dense_bytes = self.dense_data.len() * std::mem::size_of::<u64>();
371 let skip_bytes = self.skip_index.as_ref().map_or(0, |s| s.heap_bytes());
372 lookup_bytes + dense_bytes + skip_bytes
373 }
374
375 pub fn has_key(&self, key: u16) -> bool {
377 self.lookup[key as usize] != NO_COLUMN
378 }
379
380 pub fn lookup(&self) -> &[u16] {
382 &self.lookup
383 }
384
385 pub fn dense_data(&self) -> &[u64] {
387 &self.dense_data
388 }
389
390 pub fn words(&self) -> usize {
392 self.words
393 }
394
395 pub fn dense_count(&self) -> usize {
397 self.dense_count
398 }
399
400 pub fn populated(&self) -> usize {
402 self.populated
403 }
404
405 pub fn skip_index(&self) -> Option<&BigramFilter> {
407 self.skip_index.as_deref()
408 }
409
410 pub fn new(
412 lookup: Vec<u16>,
413 dense_data: Vec<u64>,
414 dense_count: usize,
415 words: usize,
416 file_count: usize,
417 populated: usize,
418 ) -> Self {
419 Self {
420 lookup,
421 dense_data,
422 dense_count,
423 words,
424 file_count,
425 populated,
426 skip_index: None,
427 }
428 }
429}
430
/// Returns the distinct, case-folded consecutive-bigram keys of `content`
/// in first-occurrence order.
///
/// Only printable-ASCII byte pairs (0x20..=0x7E) are considered; each key is
/// `(lowercase(a) << 8) | lowercase(b)`, matching the builder's encoding.
pub fn extract_bigrams(content: &[u8]) -> Vec<u16> {
    if content.len() < 2 {
        return Vec::new();
    }
    // Stack-allocated 65536-bit dedup bitmap, matching the arrays used in
    // `add_file_content` and avoiding the per-call heap allocation the old
    // `vec![0u64; 1024]` incurred.
    let mut seen = [0u64; 1024];
    let mut bigrams = Vec::new();

    let mut prev = content[0];
    for &b in &content[1..] {
        if (32..=126).contains(&prev) && (32..=126).contains(&b) {
            let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
            let word = key as usize / 64;
            let bit = 1u64 << (key as usize % 64);
            if seen[word] & bit == 0 {
                seen[word] |= bit;
                bigrams.push(key);
            }
        }
        prev = b;
    }
    bigrams
}
454
/// Mutable overlay layered on top of an immutable `BigramFilter`, tracking
/// files that were modified or deleted after the base index was built.
#[derive(Debug)]
pub struct BigramOverlay {
    // file index -> distinct bigram keys of the file's current content.
    modified: AHashMap<usize, Vec<u16>>,

    // One bit per base file; a set bit marks the file as deleted.
    tombstones: Vec<u64>,

    // Number of files in the base index (determines `tombstones` length).
    base_file_count: usize,
}
472
473impl BigramOverlay {
474 pub(crate) fn new(base_file_count: usize) -> Self {
475 let words = base_file_count.div_ceil(64);
476 Self {
477 modified: AHashMap::new(),
478 tombstones: vec![0u64; words],
479 base_file_count,
480 }
481 }
482
483 pub(crate) fn modify_file(&mut self, file_idx: usize, content: &[u8]) {
484 self.modified.insert(file_idx, extract_bigrams(content));
485 }
486
487 pub(crate) fn delete_file(&mut self, file_idx: usize) {
488 if file_idx < self.base_file_count {
489 let word = file_idx / 64;
490 self.tombstones[word] |= 1u64 << (file_idx % 64);
491 }
492 self.modified.remove(&file_idx);
493 }
494
495 pub(crate) fn query_modified(&self, pattern_bigrams: &[u16]) -> Vec<usize> {
498 if pattern_bigrams.is_empty() {
499 return self.modified.keys().copied().collect();
500 }
501 self.modified
502 .iter()
503 .filter_map(|(&file_idx, bigrams)| {
504 pattern_bigrams
505 .iter()
506 .all(|pb| bigrams.contains(pb))
507 .then_some(file_idx)
508 })
509 .collect()
510 }
511
512 pub(crate) fn base_file_count(&self) -> usize {
514 self.base_file_count
515 }
516
517 pub(crate) fn tombstones(&self) -> &[u64] {
519 &self.tombstones
520 }
521
522 pub(crate) fn modified_indices(&self) -> Vec<usize> {
525 self.modified.keys().copied().collect()
526 }
527}