use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use std::sync::atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering};
use crate::constraints::Constrainable;
use crate::query_tracker::QueryMatchEntry;
use ahash::AHashMap;
use fff_query_parser::{FFFQuery, FuzzyQuery, Location};
/// Backing storage for a file's bytes.
///
/// `Mmap` avoids copying large files into memory; `Buffer` holds an owned
/// copy and is used for small files and always on Windows builds.
#[derive(Debug)]
#[allow(dead_code)]
pub enum FileContent {
    /// Memory-mapped view of the file (not compiled on Windows).
    #[cfg(not(target_os = "windows"))]
    Mmap(memmap2::Mmap),
    /// Owned in-memory copy of the file's bytes.
    Buffer(Vec<u8>),
}
impl std::ops::Deref for FileContent {
type Target = [u8];
fn deref(&self) -> &[u8] {
match self {
#[cfg(not(target_os = "windows"))]
FileContent::Mmap(m) => m,
FileContent::Buffer(b) => b,
}
}
}
/// A tracked file plus the metadata used for matching and ranking.
#[derive(Debug)]
pub struct FileItem {
    /// Path used for on-disk I/O when loading content.
    pub path: PathBuf,
    /// Path relative to the project root; used by constraint matching.
    pub relative_path: String,
    /// Final path component, cached for filename scoring.
    pub file_name: String,
    /// File size in bytes; also the unit charged against the content-cache budget.
    pub size: u64,
    /// Modification time (NOTE(review): presumably an epoch timestamp — units
    /// are not established in this file; confirm against the indexer).
    pub modified: u64,
    /// Frecency score derived from file accesses.
    pub access_frecency_score: i32,
    /// Frecency score derived from file modifications.
    pub modification_frecency_score: i32,
    /// Combined frecency (NOTE(review): presumably access + modification —
    /// confirm where scores are computed).
    pub total_frecency_score: i32,
    /// Git status flags when the file lives in a repository.
    pub git_status: Option<git2::Status>,
    /// Whether the file was detected as binary; binary files are excluded
    /// from uncached content loads in `get_content_for_search`.
    pub is_binary: bool,
    /// Soft-delete marker.
    pub is_deleted: bool,
    /// Lazily loaded, budget-accounted content; deliberately reset on clone
    /// and by `invalidate_mmap`.
    content: OnceLock<FileContent>,
}
impl Clone for FileItem {
    /// Manual impl because `OnceLock<FileContent>` is not `Clone` (mmaps
    /// cannot be cloned): the clone starts with an empty content cache and
    /// re-loads lazily. Note the clone is NOT charged against any
    /// `ContentCacheBudget`; accounting stays with the original item.
    fn clone(&self) -> Self {
        Self {
            path: self.path.clone(),
            relative_path: self.relative_path.clone(),
            file_name: self.file_name.clone(),
            size: self.size,
            modified: self.modified,
            access_frecency_score: self.access_frecency_score,
            modification_frecency_score: self.modification_frecency_score,
            total_frecency_score: self.total_frecency_score,
            git_status: self.git_status,
            is_binary: self.is_binary,
            is_deleted: self.is_deleted,
            // Fresh, empty cache — see doc comment above.
            content: OnceLock::new(),
        }
    }
}
/// Borrowed-or-owned view of file bytes returned to search code: either a
/// reference into the long-lived per-item cache, or a temporary load that is
/// dropped after use (when the cache budget did not allow caching).
pub enum FileContentRef<'a> {
    /// Slice borrowed from the item's cached content.
    Cached(&'a [u8]),
    /// Freshly loaded content owned by this value.
    Temp(FileContent),
}
impl std::ops::Deref for FileContentRef<'_> {
type Target = [u8];
fn deref(&self) -> &[u8] {
match self {
FileContentRef::Cached(s) => s,
FileContentRef::Temp(c) => c,
}
}
}
impl FileItem {
    /// Builds a `FileItem` with zeroed frecency scores, no deletion flag, and
    /// an empty (not yet loaded) content cache.
    pub fn new_raw(
        path: PathBuf,
        relative_path: String,
        file_name: String,
        size: u64,
        modified: u64,
        git_status: Option<git2::Status>,
        is_binary: bool,
    ) -> Self {
        Self {
            path,
            relative_path,
            file_name,
            size,
            modified,
            access_frecency_score: 0,
            modification_frecency_score: 0,
            total_frecency_score: 0,
            git_status,
            is_binary,
            is_deleted: false,
            content: OnceLock::new(),
        }
    }

    /// Drops any cached content and returns its accounting to `budget`.
    ///
    /// The released byte count is approximated by `self.size` — the same
    /// amount charged in `get_content` — so callers must invalidate *before*
    /// updating `self.size`, or the budget will drift.
    pub fn invalidate_mmap(&mut self, budget: &ContentCacheBudget) {
        if self.content.get().is_some() {
            budget.cached_count.fetch_sub(1, Ordering::Relaxed);
            budget.cached_bytes.fetch_sub(self.size, Ordering::Relaxed);
        }
        self.content = OnceLock::new();
    }

    /// Returns the file's content, loading and caching it if the budget allows.
    ///
    /// Returns `None` for empty or oversized files, when the budget is
    /// exhausted, or when the file cannot be read. The budget check and the
    /// later increment are not atomic, so concurrent callers may overshoot
    /// the limits slightly; the limits are soft caps by design.
    pub fn get_content(&self, budget: &ContentCacheBudget) -> Option<&[u8]> {
        if let Some(content) = self.content.get() {
            return Some(content);
        }
        let max_file_size = budget.max_file_size;
        if self.size == 0 || self.size > max_file_size {
            return None;
        }
        let count = budget.cached_count.load(Ordering::Relaxed);
        let bytes = budget.cached_bytes.load(Ordering::Relaxed);
        let max_files = budget.max_files;
        let max_bytes = budget.max_bytes;
        if count >= max_files || bytes + self.size > max_bytes {
            return None;
        }
        let content = load_file_content(&self.path, self.size)?;
        // Only the thread whose closure actually ran owns the cache entry.
        // A racing thread's freshly loaded copy is dropped by `get_or_init`,
        // and it must NOT charge the budget a second time (the previous code
        // incremented unconditionally here, double-counting on races).
        let mut stored = false;
        let result = self.content.get_or_init(|| {
            stored = true;
            content
        });
        if stored {
            budget.cached_count.fetch_add(1, Ordering::Relaxed);
            budget.cached_bytes.fetch_add(self.size, Ordering::Relaxed);
        }
        Some(result)
    }

    /// Content for a one-off search: prefers the budget-backed cache, and
    /// otherwise performs an uncached temporary load.
    ///
    /// The temporary path skips binary, empty, and oversized files.
    /// NOTE(review): the cached path (`get_content`) does not check
    /// `is_binary` — confirm that is intentional at the call sites.
    #[inline]
    pub fn get_content_for_search<'a>(
        &'a self,
        budget: &ContentCacheBudget,
    ) -> Option<FileContentRef<'a>> {
        if let Some(cached) = self.get_content(budget) {
            return Some(FileContentRef::Cached(cached));
        }
        let max_file_size = budget.max_file_size;
        if self.is_binary || self.size == 0 || self.size > max_file_size {
            return None;
        }
        let content = load_file_content(&self.path, self.size)?;
        Some(FileContentRef::Temp(content))
    }
}
/// Cap on distinct bigram columns tracked by the index builder (memory bound).
const MAX_BIGRAM_COLUMNS: usize = 5000;
/// Sentinel column id meaning "no column allocated for this bigram".
const NO_COLUMN: u32 = u32::MAX;
/// Files at or above this size are mmapped; smaller ones are read outright.
/// NOTE(review): the higher aarch64 threshold presumably reflects 16 KiB
/// pages on common aarch64 systems — confirm.
#[cfg(target_arch = "aarch64")]
const MMAP_THRESHOLD: u64 = 16 * 1024;
#[cfg(not(target_arch = "aarch64"))]
const MMAP_THRESHOLD: u64 = 4 * 1024;
/// Reads a file's content, choosing between an owned buffer and an mmap.
///
/// On non-Windows targets, files at or above `MMAP_THRESHOLD` are memory
/// mapped; smaller files are cheaper to read directly. Windows always reads
/// into a buffer. Returns `None` on any I/O failure.
fn load_file_content(path: &Path, size: u64) -> Option<FileContent> {
    #[cfg(not(target_os = "windows"))]
    if size >= MMAP_THRESHOLD {
        let file = std::fs::File::open(path).ok()?;
        // SAFETY: the mapping is read-only; as elsewhere in this module we
        // accept mmap's usual caveat that the file must not be truncated
        // while mapped.
        let mmap = unsafe { memmap2::Mmap::map(&file) }.ok()?;
        return Some(FileContent::Mmap(mmap));
    }
    #[cfg(target_os = "windows")]
    let _ = size;
    std::fs::read(path).ok().map(FileContent::Buffer)
}
/// Accessor glue so search constraints can be evaluated against `FileItem`s.
impl Constrainable for FileItem {
    #[inline]
    fn relative_path(&self) -> &str {
        &self.relative_path
    }
    #[inline]
    fn file_name(&self) -> &str {
        &self.file_name
    }
    #[inline]
    fn git_status(&self) -> Option<git2::Status> {
        self.git_status
    }
}
/// Breakdown of how a single match was scored.
///
/// NOTE(review): presumably `total` combines the component fields (bonuses
/// positive, penalties negative) — confirm in the scoring code.
#[derive(Debug, Clone, Default)]
pub struct Score {
    pub total: i32,
    /// Raw fuzzy-match score before bonuses and penalties.
    pub base_score: i32,
    /// Bonus for matching within the file name.
    pub filename_bonus: i32,
    /// Extra bonus for well-known special file names.
    pub special_filename_bonus: i32,
    /// Boost from access/modification frecency.
    pub frecency_boost: i32,
    /// Boost for files with interesting git status.
    pub git_status_boost: i32,
    /// Penalty based on path distance.
    pub distance_penalty: i32,
    /// Penalty applied to the currently open file.
    pub current_file_penalty: i32,
    /// Boost for repeated query/file pairings (combo matches).
    pub combo_match_boost: i32,
    /// Whether the match was exact.
    pub exact_match: bool,
    /// Label for which matching strategy produced this score.
    pub match_type: &'static str,
}
/// Result-page window: skip `offset` matches, then return up to `limit`.
#[derive(Debug, Clone, Copy)]
pub struct PaginationArgs {
    pub offset: usize,
    pub limit: usize,
}

impl Default for PaginationArgs {
    /// First page of up to 100 results.
    fn default() -> Self {
        Self { offset: 0, limit: 100 }
    }
}
/// Everything a scoring pass needs besides the candidate files themselves.
#[derive(Debug, Clone)]
pub struct ScoringContext<'a> {
    /// Parsed query driving the search.
    pub query: &'a FFFQuery<'a>,
    /// Project root, when known.
    pub project_path: Option<&'a Path>,
    /// Currently open file, if any (subject to `current_file_penalty`).
    pub current_file: Option<&'a str>,
    /// Maximum typos tolerated by fuzzy matching.
    pub max_typos: u16,
    /// Thread budget for parallel scoring.
    pub max_threads: usize,
    /// Prior match for this exact query, used for combo boosting.
    pub last_same_query_match: Option<QueryMatchEntry>,
    /// Multiplier applied when a combo boost fires.
    pub combo_boost_score_multiplier: i32,
    /// Minimum repeat count before combo boosting applies.
    pub min_combo_count: u32,
    /// Page window for the returned results.
    pub pagination: PaginationArgs,
}
impl ScoringContext<'_> {
    /// The single text fragment used for fuzzy matching.
    ///
    /// Prefers the parsed fuzzy text; for multi-part queries only the first
    /// part is used here (NOTE(review): presumably remaining parts are
    /// handled elsewhere — confirm). Falls back to the trimmed raw query.
    pub fn effective_query(&self) -> &str {
        match &self.query.fuzzy_query {
            FuzzyQuery::Text(t) => t,
            FuzzyQuery::Parts(parts) if !parts.is_empty() => parts[0],
            _ => self.query.raw_query.trim(),
        }
    }
}
/// One page of search results borrowing from the file index.
#[derive(Debug, Clone, Default)]
pub struct SearchResult<'a> {
    /// Matched files for the requested page.
    pub items: Vec<&'a FileItem>,
    /// Score breakdowns (NOTE(review): presumably parallel to `items`, one
    /// score per item — confirm at the producer).
    pub scores: Vec<Score>,
    /// Total number of matches across all pages.
    pub total_matched: usize,
    /// Total number of files considered.
    pub total_files: usize,
    /// Location parsed from the query, if present.
    pub location: Option<Location>,
}
/// Hard ceiling on the size of any single file we will cache or mmap.
const MAX_MMAP_FILE_SIZE: u64 = 10 * 1024 * 1024;
/// Upper bound on total cached bytes for small repositories.
const MAX_CACHED_CONTENT_BYTES: u64 = 512 * 1024 * 1024;

/// Shared accounting for how much file content may be held in memory.
///
/// The `max_*` fields are soft limits consulted before loading; `cached_*`
/// track current usage with relaxed atomics (approximate by design).
#[derive(Debug)]
pub struct ContentCacheBudget {
    pub max_files: usize,
    pub max_bytes: u64,
    pub max_file_size: u64,
    pub cached_count: AtomicUsize,
    pub cached_bytes: AtomicU64,
}

impl ContentCacheBudget {
    /// Builds a budget with the given limits and zeroed usage counters.
    fn with_limits(max_files: usize, max_bytes: u64, max_file_size: u64) -> Self {
        Self {
            max_files,
            max_bytes,
            max_file_size,
            cached_count: AtomicUsize::new(0),
            cached_bytes: AtomicU64::new(0),
        }
    }

    /// Effectively no caps (per-file size limit still applies).
    pub fn unlimited() -> Self {
        Self::with_limits(usize::MAX, u64::MAX, MAX_MMAP_FILE_SIZE)
    }

    /// Disables caching entirely (even single-file loads are rejected).
    pub fn zero() -> Self {
        Self::with_limits(0, 0, 0)
    }

    /// Budget tiered by repository size: larger repos get tighter caps so the
    /// cache cannot balloon.
    pub fn new_for_repo(file_count: usize) -> Self {
        let (max_files, max_bytes) = if file_count > 50_000 {
            (5_000, 128 * 1024 * 1024)
        } else if file_count > 10_000 {
            (10_000, 256 * 1024 * 1024)
        } else {
            (30_000, MAX_CACHED_CONTENT_BYTES)
        };
        Self::with_limits(max_files, max_bytes, MAX_MMAP_FILE_SIZE)
    }

    /// Zeroes the usage counters (e.g. after all cached content is dropped).
    pub fn reset(&self) {
        self.cached_count.store(0, Ordering::Relaxed);
        self.cached_bytes.store(0, Ordering::Relaxed);
    }
}

impl Default for ContentCacheBudget {
    /// Assumes a mid-sized repo (30k files → the 10_000-file / 256 MiB tier).
    fn default() -> Self {
        Self::new_for_repo(30_000)
    }
}
/// Lock-free builder for the bigram → file-membership index.
pub struct BigramIndexBuilder {
    /// Maps each of the 65536 possible byte-pair keys to a column id in
    /// `col_data`, or `NO_COLUMN` if none was allocated yet.
    lookup: Vec<AtomicU32>,
    /// Per-column file bitsets, allocated lazily on first use.
    col_data: Vec<OnceLock<Box<[AtomicU64]>>>,
    /// Next free column id (monotonically increasing, may pass the cap).
    next_column: AtomicU32,
    /// Bitset length in u64 words (= ceil(file_count / 64)).
    words: usize,
    /// Number of files the bitsets cover.
    file_count: usize,
    /// How many files have been indexed so far.
    populated: AtomicUsize,
}
impl BigramIndexBuilder {
    /// Creates a builder sized for `file_count` files.
    ///
    /// The full 65536-entry lookup is allocated up front; the per-column
    /// bitsets are allocated lazily on first touch.
    pub fn new(file_count: usize) -> Self {
        // One u64 word holds membership bits for 64 files.
        let words = file_count.div_ceil(64);
        let mut lookup = Vec::with_capacity(65536);
        lookup.resize_with(65536, || AtomicU32::new(NO_COLUMN));
        let mut col_data = Vec::with_capacity(MAX_BIGRAM_COLUMNS);
        col_data.resize_with(MAX_BIGRAM_COLUMNS, OnceLock::new);
        Self {
            lookup,
            col_data,
            next_column: AtomicU32::new(0),
            words,
            file_count,
            populated: AtomicUsize::new(0),
        }
    }

    /// Returns the column id for bigram `key`, allocating one if needed.
    ///
    /// Lock-free: a column id is reserved with `fetch_add` *before* the CAS
    /// that publishes it, so on a lost race the reserved id is leaked (never
    /// reused). That waste is bounded and accepted for simplicity. Returns
    /// `NO_COLUMN` once `MAX_BIGRAM_COLUMNS` columns are exhausted.
    #[inline]
    fn get_or_alloc_column(&self, key: u16) -> u32 {
        let current = self.lookup[key as usize].load(Ordering::Relaxed);
        if current != NO_COLUMN {
            return current;
        }
        // Reserve a column slot before publishing it in the lookup table.
        let new_col = self.next_column.fetch_add(1, Ordering::Relaxed);
        if new_col >= MAX_BIGRAM_COLUMNS as u32 {
            return NO_COLUMN;
        }
        match self.lookup[key as usize].compare_exchange(
            NO_COLUMN,
            new_col,
            Ordering::Relaxed,
            Ordering::Relaxed,
        ) {
            Ok(_) => new_col,
            // Another thread published first; use its column (ours leaks).
            Err(existing) => existing,
        }
    }

    /// Lazily allocates (zeroed) and returns the bitset for column `col`.
    #[inline]
    fn column_bitset(&self, col: u32) -> &[AtomicU64] {
        let words = self.words;
        self.col_data[col as usize].get_or_init(|| {
            let mut v = Vec::with_capacity(words);
            v.resize_with(words, || AtomicU64::new(0));
            v.into_boxed_slice()
        })
    }

    /// Indexes every adjacent printable-ASCII bigram of `content` for
    /// `file_idx` (case-insensitive), then bumps the populated counter.
    pub fn add_file_content(&self, file_idx: usize, content: &[u8]) {
        if content.len() < 2 {
            return;
        }
        debug_assert!(file_idx < self.file_count);
        let word_idx = file_idx / 64;
        let bit_mask = 1u64 << (file_idx % 64);
        let mut prev = content[0];
        for &b in &content[1..] {
            // Only printable ASCII (0x20..=0x7E) participates in the index.
            if (32..=126).contains(&prev) && (32..=126).contains(&b) {
                let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
                let col = self.get_or_alloc_column(key);
                if col != NO_COLUMN {
                    self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
                }
            }
            prev = b;
        }
        self.populated.fetch_add(1, Ordering::Relaxed);
    }

    /// Like `add_file_content`, but indexes "skip bigrams": byte pairs two
    /// positions apart (content[i], content[i + 2]).
    pub fn add_file_content_skip(&self, file_idx: usize, content: &[u8]) {
        if content.len() < 3 {
            return;
        }
        debug_assert!(file_idx < self.file_count);
        let word_idx = file_idx / 64;
        let bit_mask = 1u64 << (file_idx % 64);
        for i in 0..content.len() - 2 {
            let a = content[i];
            let b = content[i + 2];
            if (32..=126).contains(&a) && (32..=126).contains(&b) {
                let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
                let col = self.get_or_alloc_column(key);
                if col != NO_COLUMN {
                    self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed);
                }
            }
        }
        self.populated.fetch_add(1, Ordering::Relaxed);
    }

    /// True once at least one file has been indexed.
    pub fn is_ready(&self) -> bool {
        self.populated.load(Ordering::Relaxed) > 0
    }

    /// Number of allocated columns, clamped to the cap (lost-race
    /// over-allocation can push `next_column` past `MAX_BIGRAM_COLUMNS`).
    pub fn columns_used(&self) -> u32 {
        self.next_column
            .load(Ordering::Relaxed)
            .min(MAX_BIGRAM_COLUMNS as u32)
    }

    /// Consumes the builder and produces a compact, read-only `BigramFilter`.
    ///
    /// A column is kept only when dense enough to be worth storing:
    /// - with `min_density_pct = Some(p)`: set in at least p% of populated
    ///   files;
    /// - otherwise: `popcount * 4 >= words * 8`, i.e. on average at least one
    ///   bit set per two words.
    /// Columns set in >= 90% of populated files are dropped regardless —
    /// they match nearly everything and filter nothing.
    pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
        let cols = self.columns_used() as usize;
        let words = self.words;
        let file_count = self.file_count;
        let populated = self.populated.load(Ordering::Relaxed);
        // Byte size of one dense column; basis of the default keep heuristic.
        let dense_bytes = words * 8;
        let old_lookup = self.lookup;
        let mut col_data = self.col_data;
        let mut lookup = vec![NO_COLUMN; 65536];
        let mut dense_data: Vec<u64> = Vec::with_capacity(cols * words);
        let mut dense_count: usize = 0;
        for key in 0..65536u32 {
            let old_col = old_lookup[key as usize].load(Ordering::Relaxed);
            if old_col == NO_COLUMN || old_col as usize >= cols {
                continue;
            }
            // A column can be allocated but never materialized; skip those.
            let Some(bitset) = col_data[old_col as usize].take() else {
                continue;
            };
            let mut popcount = 0u32;
            for w in 0..words {
                popcount += bitset[w].load(Ordering::Relaxed).count_ones();
            }
            // NOTE(review): despite the name, `sparse_ok` means "dense enough
            // to keep" under the active heuristic.
            let sparse_ok = if let Some(min_pct) = min_density_pct {
                populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
            } else {
                (popcount as usize * 4) >= dense_bytes
            };
            if !sparse_ok {
                continue;
            }
            // Drop near-universal columns (>= 90% of populated files).
            if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
                continue;
            }
            let dense_idx = dense_count as u32;
            lookup[key as usize] = dense_idx;
            dense_count += 1;
            for w in 0..words {
                dense_data.push(bitset[w].load(Ordering::Relaxed));
            }
        }
        drop(col_data);
        drop(old_lookup);
        BigramFilter {
            lookup,
            dense_data,
            dense_count,
            words,
            file_count,
            populated,
            skip_index: None,
        }
    }
}
// SAFETY: every field of `BigramIndexBuilder` is either a plain integer or an
// atomic/`OnceLock` container of atomics, all of which are already
// `Send + Sync`, so the compiler would derive these impls automatically. The
// explicit `unsafe impl`s are redundant but harmless; kept to document the
// intended cross-thread use.
unsafe impl Send for BigramIndexBuilder {}
unsafe impl Sync for BigramIndexBuilder {}
/// Read-only, compressed bigram filter produced by
/// `BigramIndexBuilder::compress`.
#[derive(Debug)]
pub struct BigramFilter {
    /// Maps each 16-bit bigram key to a dense column id, or `NO_COLUMN`.
    /// Invariant: every stored id is < `dense_count`.
    lookup: Vec<u32>,
    /// `dense_count` contiguous bitsets of `words` u64 each.
    dense_data: Vec<u64>,
    /// Number of columns kept after compression.
    dense_count: usize,
    /// Bitset length in u64 words (= ceil(file_count / 64)).
    words: usize,
    /// Number of files the bitsets cover.
    file_count: usize,
    /// Number of files that were actually indexed by the builder.
    populated: usize,
    /// Optional secondary filter over skip bigrams (pairs two bytes apart).
    skip_index: Option<Box<BigramFilter>>,
}
/// In-place intersection: `result[i] &= bitset[i]` for each shared index,
/// truncating to the shorter of the two slices.
#[inline]
fn bitset_and(result: &mut [u64], bitset: &[u64]) {
    for (dst, src) in result.iter_mut().zip(bitset) {
        *dst &= *src;
    }
}
impl BigramFilter {
    /// Returns a candidate bitset (one bit per file) for files that may
    /// contain `pattern`, or `None` when the pattern is too short or no
    /// indexed bigram constrained the result ("cannot filter").
    ///
    /// False positives are expected (it is a bigram filter); false negatives
    /// are not, provided the filter was built over the files being queried.
    pub fn query(&self, pattern: &[u8]) -> Option<Vec<u64>> {
        if pattern.len() < 2 {
            return None;
        }
        // Start from "all files", masking off unused tail bits.
        let mut result = vec![u64::MAX; self.words];
        if !self.file_count.is_multiple_of(64) {
            let last = self.words - 1;
            result[last] = (1u64 << (self.file_count % 64)) - 1;
        }
        let words = self.words;
        let mut has_filter = false;
        let mut prev = pattern[0];
        for &b in &pattern[1..] {
            // Only printable-ASCII pairs were indexed; others can't constrain.
            if (32..=126).contains(&prev) && (32..=126).contains(&b) {
                let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
                let col = self.lookup[key as usize];
                if col != NO_COLUMN {
                    let offset = col as usize * words;
                    // SAFETY: `compress` only stores ids < dense_count in
                    // `lookup`, and `dense_data` holds exactly
                    // dense_count * words u64s, so offset..offset + words is
                    // in bounds.
                    let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
                    bitset_and(&mut result, slice);
                    has_filter = true;
                }
            }
            prev = b;
        }
        // Additionally intersect with the skip-bigram index when available.
        if let Some(skip) = &self.skip_index
            && pattern.len() >= 3
            && let Some(skip_candidates) = skip.query_skip(pattern)
        {
            bitset_and(&mut result, &skip_candidates);
            has_filter = true;
        }
        has_filter.then_some(result)
    }

    /// Skip-bigram variant of `query`: intersects columns for byte pairs two
    /// positions apart (pattern[i], pattern[i + 2]).
    fn query_skip(&self, pattern: &[u8]) -> Option<Vec<u64>> {
        let mut result = vec![u64::MAX; self.words];
        if !self.file_count.is_multiple_of(64) {
            let last = self.words - 1;
            result[last] = (1u64 << (self.file_count % 64)) - 1;
        }
        let words = self.words;
        let mut has_filter = false;
        for i in 0..pattern.len().saturating_sub(2) {
            let a = pattern[i];
            let b = pattern[i + 2];
            if (32..=126).contains(&a) && (32..=126).contains(&b) {
                let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
                let col = self.lookup[key as usize];
                if col != NO_COLUMN {
                    let offset = col as usize * words;
                    // SAFETY: same invariant as in `query` — lookup ids are
                    // < dense_count and dense_data is dense_count * words long.
                    let slice = unsafe { self.dense_data.get_unchecked(offset..offset + words) };
                    bitset_and(&mut result, slice);
                    has_filter = true;
                }
            }
        }
        has_filter.then_some(result)
    }

    /// Attaches a secondary filter built over skip bigrams.
    /// NOTE(review): assumes `skip` covers the same file set / word count as
    /// `self` (otherwise the intersection truncates) — confirm at call sites.
    pub fn set_skip_index(&mut self, skip: BigramFilter) {
        self.skip_index = Some(Box::new(skip));
    }

    /// Tests whether `file_idx`'s bit is set in a candidate bitset.
    #[inline]
    pub fn is_candidate(candidates: &[u64], file_idx: usize) -> bool {
        let word = file_idx / 64;
        let bit = file_idx % 64;
        word < candidates.len() && candidates[word] & (1u64 << bit) != 0
    }

    /// Number of set bits (candidate files) in a bitset.
    pub fn count_candidates(candidates: &[u64]) -> usize {
        candidates.iter().map(|w| w.count_ones() as usize).sum()
    }

    /// True when at least one file was indexed into this filter.
    pub fn is_ready(&self) -> bool {
        self.populated > 0
    }

    /// Number of files the filter's bitsets cover.
    pub fn file_count(&self) -> usize {
        self.file_count
    }

    /// Number of bigram columns kept after compression.
    pub fn columns_used(&self) -> usize {
        self.dense_count
    }

    /// Approximate heap usage: lookup + dense data + any skip index.
    pub fn heap_bytes(&self) -> usize {
        let lookup_bytes = self.lookup.len() * std::mem::size_of::<u32>();
        let dense_bytes = self.dense_data.len() * std::mem::size_of::<u64>();
        let skip_bytes = self.skip_index.as_ref().map_or(0, |s| s.heap_bytes());
        lookup_bytes + dense_bytes + skip_bytes
    }

    /// Whether a column was kept for bigram `key`.
    pub fn has_key(&self, key: u16) -> bool {
        self.lookup[key as usize] != NO_COLUMN
    }
}
/// Extracts the deduplicated set of printable-ASCII bigram keys from
/// `content`, case-folded, in first-occurrence order.
///
/// A key packs the lowercased pair as `(first << 8) | second`. Pairs where
/// either byte falls outside 0x20..=0x7E are skipped.
pub fn extract_bigrams(content: &[u8]) -> Vec<u16> {
    let mut bigrams = Vec::new();
    if content.len() < 2 {
        return bigrams;
    }
    // 65536 possible keys -> 1024 u64 words of "already seen" bits.
    let mut seen = vec![0u64; 1024];
    for pair in content.windows(2) {
        let (a, b) = (pair[0], pair[1]);
        if (32..=126).contains(&a) && (32..=126).contains(&b) {
            let key = (a.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16;
            let word = key as usize / 64;
            let bit = 1u64 << (key as usize % 64);
            if seen[word] & bit == 0 {
                seen[word] |= bit;
                bigrams.push(key);
            }
        }
    }
    bigrams
}
/// Incremental changes layered on top of an immutable `BigramFilter`:
/// modified files (re-extracted bigram sets), deleted files (tombstones over
/// base indices), and files added after the base index was built.
#[derive(Debug)]
pub struct BigramOverlay {
    /// base-file-index -> deduplicated bigram set of the new content.
    modified: AHashMap<usize, Vec<u16>>,
    /// One bit per base file; set = deleted.
    tombstones: Vec<u64>,
    /// Bigram sets for added files, indexed by insertion order.
    added: Vec<Vec<u16>>,
    /// Number of files in the base index.
    base_file_count: usize,
}
impl BigramOverlay {
    /// Creates an empty overlay over a base index of `base_file_count` files.
    pub fn new(base_file_count: usize) -> Self {
        Self {
            modified: AHashMap::new(),
            tombstones: vec![0u64; base_file_count.div_ceil(64)],
            added: Vec::new(),
            base_file_count,
        }
    }

    /// Records fresh content for an existing file, replacing any prior entry.
    pub fn modify_file(&mut self, file_idx: usize, content: &[u8]) {
        let bigrams = extract_bigrams(content);
        self.modified.insert(file_idx, bigrams);
    }

    /// Marks a base-index file as deleted and drops any pending modification.
    pub fn delete_file(&mut self, file_idx: usize) {
        if file_idx < self.base_file_count {
            self.tombstones[file_idx / 64] |= 1u64 << (file_idx % 64);
        }
        self.modified.remove(&file_idx);
    }

    /// Tracks a brand-new file that is not part of the base index.
    pub fn add_file(&mut self, content: &[u8]) {
        let bigrams = extract_bigrams(content);
        self.added.push(bigrams);
    }

    /// Indices of modified files whose bigram sets contain every pattern
    /// bigram; an empty pattern matches all modified files.
    pub fn query_modified(&self, pattern_bigrams: &[u16]) -> Vec<usize> {
        if pattern_bigrams.is_empty() {
            return self.modified.keys().copied().collect();
        }
        let mut hits = Vec::new();
        for (&file_idx, bigrams) in &self.modified {
            if pattern_bigrams.iter().all(|pb| bigrams.contains(pb)) {
                hits.push(file_idx);
            }
        }
        hits
    }

    /// Like `query_modified`, but over added files; returned values index
    /// into the `added` list, not the base index.
    pub fn query_added(&self, pattern_bigrams: &[u16]) -> Vec<usize> {
        if pattern_bigrams.is_empty() {
            return (0..self.added.len()).collect();
        }
        let mut hits = Vec::new();
        for (idx, bigrams) in self.added.iter().enumerate() {
            if pattern_bigrams.iter().all(|pb| bigrams.contains(pb)) {
                hits.push(idx);
            }
        }
        hits
    }

    /// Bitset of deleted base files, one bit per base index.
    pub fn tombstones(&self) -> &[u64] {
        &self.tombstones
    }

    /// Whether `file_idx` has been marked deleted.
    pub fn is_tombstoned(&self, file_idx: usize) -> bool {
        self.tombstones
            .get(file_idx / 64)
            .is_some_and(|w| w & (1u64 << (file_idx % 64)) != 0)
    }

    /// Number of files in the underlying base index.
    pub fn base_file_count(&self) -> usize {
        self.base_file_count
    }

    /// Removes an added-file entry if it exists. Note this is O(n) and shifts
    /// the indices of all later added entries.
    pub fn remove_added(&mut self, idx: usize) {
        if idx < self.added.len() {
            self.added.remove(idx);
        }
    }

    /// Replaces the bigram set of an added-file entry, if it exists.
    pub fn update_added(&mut self, idx: usize, bigrams: Vec<u16>) {
        if let Some(slot) = self.added.get_mut(idx) {
            *slot = bigrams;
        }
    }

    /// Rough overlay size: modified entries + added entries + tombstone count.
    pub fn overlay_size(&self) -> usize {
        let deleted: usize = self
            .tombstones
            .iter()
            .map(|w| w.count_ones() as usize)
            .sum();
        self.modified.len() + self.added.len() + deleted
    }
}