use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::num::NonZero;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::sync::RwLock;
pub struct FileIndex {
files: Vec<String>,
directories: Vec<String>,
last_built: std::time::Instant,
}
fn build_parallel_walker(
search_directory: &Path,
exclude: &[String],
threads: usize,
respect_gitignore: bool,
) -> anyhow::Result<ignore::WalkParallel> {
let mut walk_builder = ignore::WalkBuilder::new(search_directory);
vtcode_commons::walk::apply_defaults(&mut walk_builder);
walk_builder.threads(threads);
walk_builder.follow_links(true); walk_builder.require_git(false);
if !respect_gitignore {
walk_builder
.git_ignore(false)
.git_global(false)
.git_exclude(false)
.ignore(false)
.parents(false);
}
if !exclude.is_empty() {
let mut override_builder = ignore::overrides::OverrideBuilder::new(search_directory);
for exclude_pattern in exclude {
let pattern = format!("!{}", exclude_pattern);
override_builder.add(&pattern)?;
}
walk_builder.overrides(override_builder.build()?);
}
Ok(walk_builder.build_parallel())
}
impl FileIndex {
fn build_from_directory(
search_directory: &Path,
exclude: &[String],
respect_gitignore: bool,
threads: usize,
) -> anyhow::Result<Self> {
let walker = build_parallel_walker(search_directory, exclude, threads, respect_gitignore)?;
let files_arc = Arc::new(Mutex::new(Vec::new()));
let dirs_arc = Arc::new(Mutex::new(Vec::new()));
walker.run(|| {
let files_clone = files_arc.clone();
let dirs_clone = dirs_arc.clone();
let search_dir = search_directory.to_path_buf();
Box::new(move |result| {
let entry = match result {
Ok(e) => e,
Err(_) => return ignore::WalkState::Continue,
};
if let Some(rel_path) = entry
.path()
.strip_prefix(&search_dir)
.ok()
.and_then(|p| p.to_str())
&& !rel_path.is_empty()
{
if entry.path().is_dir() {
dirs_clone.lock().push(rel_path.to_string());
} else {
files_clone.lock().push(rel_path.to_string());
}
}
ignore::WalkState::Continue
})
});
let files = Arc::try_unwrap(files_arc)
.map_err(|arc| {
anyhow::anyhow!(
"failed to unwrap files arc, {} references remain",
Arc::strong_count(&arc)
)
})?
.into_inner();
let directories = Arc::try_unwrap(dirs_arc)
.map_err(|arc| {
anyhow::anyhow!(
"failed to unwrap dirs arc, {} references remain",
Arc::strong_count(&arc)
)
})?
.into_inner();
Ok(Self {
files,
directories,
last_built: std::time::Instant::now(),
})
}
fn query(
&self,
pattern_text: &str,
limit: usize,
match_type_filter: Option<MatchType>,
) -> Vec<(u32, String, MatchType)> {
let mut results = BinaryHeap::with_capacity(limit);
let pattern_storage = if pattern_text.is_ascii() {
PatternStorage::Ascii(pattern_text.to_ascii_lowercase().into_bytes())
} else {
PatternStorage::Unicode(pattern_text.to_lowercase().chars().collect())
};
let mut matcher = nucleo_matcher::Matcher::new(nucleo_matcher::Config::DEFAULT);
let mut haystack_buf = Vec::with_capacity(256);
if match_type_filter.is_none_or(|t| t == MatchType::File) {
for path in &self.files {
if let Some(score) =
self.score_path(path, &pattern_storage, &mut matcher, &mut haystack_buf)
{
push_top_match(&mut results, limit, score, path.clone(), MatchType::File);
}
}
}
if match_type_filter.is_none_or(|t| t == MatchType::Directory) {
for path in &self.directories {
if let Some(score) =
self.score_path(path, &pattern_storage, &mut matcher, &mut haystack_buf)
{
push_top_match(
&mut results,
limit,
score,
path.clone(),
MatchType::Directory,
);
}
}
}
results
.into_sorted_vec()
.into_iter()
.map(|Reverse(item)| item)
.collect()
}
fn score_path(
&self,
path: &str,
pattern: &PatternStorage,
matcher: &mut nucleo_matcher::Matcher,
haystack_buf: &mut Vec<char>,
) -> Option<u32> {
let haystack = nucleo_matcher::Utf32Str::new(path, haystack_buf);
let needle = match pattern {
PatternStorage::Ascii(bytes) => nucleo_matcher::Utf32Str::Ascii(bytes),
PatternStorage::Unicode(chars) => nucleo_matcher::Utf32Str::Unicode(chars),
};
matcher.fuzzy_match(haystack, needle).map(|s| s as u32)
}
}
pub struct FileIndexCache {
cache: Arc<RwLock<Option<Arc<FileIndex>>>>,
search_directory: std::path::PathBuf,
exclude: Vec<String>,
respect_gitignore: bool,
threads: usize,
}
impl FileIndexCache {
pub fn new(
search_directory: std::path::PathBuf,
exclude: impl IntoIterator<Item = String>,
respect_gitignore: bool,
threads: usize,
) -> Self {
Self {
cache: Arc::new(RwLock::new(None)),
search_directory,
exclude: exclude.into_iter().collect(),
respect_gitignore,
threads,
}
}
pub async fn get_or_build(&self) -> anyhow::Result<Arc<FileIndex>> {
{
let guard = self.cache.read().await;
if let Some(index) = guard.as_ref() {
if index.last_built.elapsed() < std::time::Duration::from_secs(300) {
return Ok(Arc::clone(index));
}
}
}
let index = Arc::new(FileIndex::build_from_directory(
&self.search_directory,
&self.exclude,
self.respect_gitignore,
self.threads,
)?);
{
let mut guard = self.cache.write().await;
*guard = Some(Arc::clone(&index));
}
Ok(index)
}
pub fn refresh_background(&self) -> Option<Arc<FileIndex>> {
let search_directory = self.search_directory.clone();
let exclude = self.exclude.clone();
let respect_gitignore = self.respect_gitignore;
let threads = self.threads;
let cache = self.cache.clone();
tokio::spawn(async move {
match FileIndex::build_from_directory(
&search_directory,
&exclude,
respect_gitignore,
threads,
) {
Ok(new_index) => {
let mut guard = cache.write().await;
*guard = Some(Arc::new(new_index));
}
Err(e) => {
tracing::error!("failed to rebuild file index: {e}");
}
}
});
let guard = self.cache.blocking_read();
guard.as_ref().map(Arc::clone)
}
pub fn update_file(&self, path: &str, is_added: bool) {
let mut guard = self.cache.blocking_write();
let Some(existing) = guard.take() else { return };
let mut index = Arc::try_unwrap(existing).unwrap_or_else(|arc| (*arc).clone());
if is_added {
if Path::new(path).is_dir() {
index.directories.push(path.to_string());
} else {
index.files.push(path.to_string());
}
} else {
index.files.retain(|p| p != path);
index.directories.retain(|p| p != path);
}
index.last_built = std::time::Instant::now();
*guard = Some(Arc::new(index));
}
pub async fn index_age(&self) -> Option<std::time::Duration> {
let guard = self.cache.read().await;
guard.as_ref().map(|idx| idx.last_built.elapsed())
}
}
impl Clone for FileIndex {
fn clone(&self) -> Self {
Self {
files: self.files.clone(),
directories: self.directories.clone(),
last_built: self.last_built,
}
}
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "lowercase")]
pub enum MatchType {
File,
Directory,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMatch {
pub score: u32,
pub path: String,
pub match_type: MatchType,
#[serde(skip_serializing_if = "Option::is_none")]
pub indices: Option<Vec<u32>>,
}
#[derive(Debug)]
pub struct FileSearchResults {
pub matches: Vec<FileMatch>,
pub total_match_count: usize,
}
pub struct FileSearchConfig {
pub pattern_text: String,
pub limit: NonZero<usize>,
pub search_directory: std::path::PathBuf,
pub exclude: Vec<String>,
pub threads: NonZero<usize>,
pub cancel_flag: Arc<AtomicBool>,
pub compute_indices: bool,
pub respect_gitignore: bool,
}
pub use vtcode_commons::paths::file_name_from_path;
struct BestMatchesList {
matches: BinaryHeap<Reverse<(u32, String, MatchType)>>,
limit: usize,
matcher: nucleo_matcher::Matcher,
haystack_buf: Vec<char>,
pattern: PatternStorage,
}
enum PatternStorage {
Ascii(Vec<u8>),
Unicode(Vec<char>),
}
impl BestMatchesList {
fn new(limit: usize, pattern_text: &str) -> Self {
let pattern = if pattern_text.is_ascii() {
PatternStorage::Ascii(pattern_text.to_ascii_lowercase().into_bytes())
} else {
PatternStorage::Unicode(pattern_text.to_lowercase().chars().collect())
};
Self {
matches: BinaryHeap::new(),
limit,
matcher: nucleo_matcher::Matcher::new(nucleo_matcher::Config::DEFAULT),
haystack_buf: Vec::with_capacity(256),
pattern,
}
}
fn record_match(&mut self, path: &str, match_type: MatchType) -> bool {
let haystack = nucleo_matcher::Utf32Str::new(path, &mut self.haystack_buf);
let needle = match &self.pattern {
PatternStorage::Ascii(bytes) => nucleo_matcher::Utf32Str::Ascii(bytes),
PatternStorage::Unicode(chars) => nucleo_matcher::Utf32Str::Unicode(chars),
};
let Some(score) = self.matcher.fuzzy_match(haystack, needle) else {
return false;
};
push_top_match(
&mut self.matches,
self.limit,
score as u32,
path.to_string(),
match_type,
);
true
}
}
fn push_top_match(
matches: &mut BinaryHeap<Reverse<(u32, String, MatchType)>>,
limit: usize,
score: u32,
path: String,
match_type: MatchType,
) -> bool {
if matches.len() < limit {
matches.push(Reverse((score, path, match_type)));
return true;
}
let Some(min_score) = matches.peek().map(|entry| entry.0.0) else {
return false;
};
if score <= min_score {
return false;
}
matches.pop();
matches.push(Reverse((score, path, match_type)));
true
}
pub async fn run_with_index(
config: FileSearchConfig,
index_cache: &FileIndexCache,
) -> anyhow::Result<FileSearchResults> {
let limit = config.limit.get();
let cancel_flag = &config.cancel_flag;
let compute_indices = config.compute_indices;
let index = index_cache.get_or_build().await?;
if cancel_flag.load(Ordering::Relaxed) {
return Ok(FileSearchResults {
matches: Vec::new(),
total_match_count: 0,
});
}
let matched_paths = index.query(&config.pattern_text, limit, None);
let total_match_count = matched_paths.len();
let matches = matched_paths
.into_iter()
.map(|(score, path, match_type)| FileMatch {
score,
path,
match_type,
indices: if compute_indices {
Some(Vec::new())
} else {
None
},
})
.collect();
Ok(FileSearchResults {
matches,
total_match_count,
})
}
pub fn run(config: FileSearchConfig) -> anyhow::Result<FileSearchResults> {
let limit = config.limit.get();
let search_directory = &config.search_directory;
let exclude = &config.exclude;
let threads = config.threads.get();
let cancel_flag = &config.cancel_flag;
let compute_indices = config.compute_indices;
let respect_gitignore = config.respect_gitignore;
let walker = build_parallel_walker(search_directory, exclude, threads, respect_gitignore)?;
let best_matchers_per_worker: Vec<Arc<Mutex<BestMatchesList>>> = (0..threads)
.map(|_| {
Arc::new(Mutex::new(BestMatchesList::new(
limit,
&config.pattern_text,
)))
})
.collect();
let total_match_count = Arc::new(AtomicUsize::new(0));
let worker_counter = AtomicUsize::new(0);
let worker_count = best_matchers_per_worker.len();
walker.run(|| {
let worker_id = worker_counter.fetch_add(1, Ordering::Relaxed) % worker_count;
let best_list = best_matchers_per_worker[worker_id].clone();
let cancel_flag_clone = cancel_flag.clone();
let total_match_count_clone = total_match_count.clone();
Box::new(move |result| {
if cancel_flag_clone.load(Ordering::Relaxed) {
return ignore::WalkState::Quit;
}
let entry = match result {
Ok(e) => e,
Err(_) => return ignore::WalkState::Continue,
};
let relative_path = entry
.path()
.strip_prefix(search_directory)
.ok()
.and_then(|p| p.to_str());
let path_to_match = match relative_path {
Some(p) if !p.is_empty() => p,
_ => return ignore::WalkState::Continue, };
let match_type = if entry.path().is_dir() {
MatchType::Directory
} else {
MatchType::File
};
{
let mut list = best_list.lock();
if list.record_match(path_to_match, match_type) {
total_match_count_clone.fetch_add(1, Ordering::Relaxed);
}
}
ignore::WalkState::Continue
})
});
let mut merged_matches = BinaryHeap::with_capacity(limit);
for arc in best_matchers_per_worker {
let mut list = arc.lock();
for Reverse((score, path, match_type)) in std::mem::take(&mut list.matches).into_vec() {
push_top_match(&mut merged_matches, limit, score, path, match_type);
}
}
let matches = merged_matches
.into_sorted_vec()
.into_iter()
.map(|Reverse((score, path, match_type))| FileMatch {
score,
path,
match_type,
indices: if compute_indices {
Some(Vec::new())
} else {
None
},
})
.collect();
Ok(FileSearchResults {
matches,
total_match_count: total_match_count.load(Ordering::Relaxed),
})
}