pub mod batch_processor; pub mod branch; pub mod code_region_extractor; pub mod commits; pub mod contextual; pub mod differential_processor; pub mod file_processor; pub mod graph_optimization;
pub mod graphrag; pub mod languages; pub mod markdown_processor; pub mod search; pub mod signature_extractor;
pub mod render_utils;
pub use batch_processor::*;
pub use code_region_extractor::*;
pub use differential_processor::*;
pub use file_processor::*;
pub use graph_optimization::*;
pub use graphrag::*;
pub use languages::*;
pub use markdown_processor::*;
pub use search::*;
pub use signature_extractor::*;
use crate::config::Config;
use crate::mcp::logging::{log_file_processing_error, log_indexing_progress};
use crate::state;
use crate::state::SharedState;
#[cfg(test)]
use crate::store::DocumentBlock;
use crate::store::Store;
pub use render_utils::*;
mod file_utils;
pub mod git_utils;
mod path_utils;
mod text_processing;
use self::file_utils::FileUtils;
pub use self::git_utils::GitUtils;
pub use self::path_utils::PathUtils;
use crate::llm::LlmClient;
use std::fs;
use anyhow::Result;
use ignore;
use std::path::Path;
use std::collections::HashMap;
use std::sync::OnceLock;
/// Marker type grouping the directory-walking helpers that honor `.noindex`
/// files in addition to the usual gitignore rules.
pub struct NoindexWalker;
/// Process-wide memo: root directory -> whether any `.noindex` file was found
/// under it, so repeated walks skip the filesystem probes.
static NOINDEX_CACHE: OnceLock<parking_lot::RwLock<HashMap<std::path::PathBuf, bool>>> =
    OnceLock::new();
impl NoindexWalker {
    /// Builds a walker rooted at `current_dir` that skips hidden files and
    /// honors `.gitignore` (local, global, and exclude) rules. When any
    /// `.noindex` file is detected under the root, `.noindex` is also
    /// registered as a custom ignore filename.
    pub fn create_walker(current_dir: &Path) -> ignore::WalkBuilder {
        let mut builder = ignore::WalkBuilder::new(current_dir);
        builder
            .hidden(true)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true)
            .follow_links(false);
        // Only pay for `.noindex` handling when such a file actually exists.
        if Self::has_noindex_files_cached(current_dir) {
            builder.add_custom_ignore_filename(".noindex");
        }
        builder
    }

    /// Memoized variant of [`Self::has_noindex_files`]; the result for each
    /// root directory is cached for the lifetime of the process.
    fn has_noindex_files_cached(current_dir: &Path) -> bool {
        let cache = NOINDEX_CACHE.get_or_init(|| parking_lot::RwLock::new(HashMap::new()));
        let current_dir_buf = current_dir.to_path_buf();
        {
            let cache_read = cache.read();
            // Restored mangled borrow: was garbled to `¤t_dir_buf`.
            if let Some(&cached_result) = cache_read.get(&current_dir_buf) {
                return cached_result;
            }
        }
        let result = Self::has_noindex_files(current_dir);
        {
            let mut cache_write = cache.write();
            cache_write.insert(current_dir_buf, result);
        }
        result
    }

    /// Heuristic probe for `.noindex` files: checks the root itself plus a
    /// fixed set of common project subdirectories (not a full recursive scan).
    fn has_noindex_files(current_dir: &Path) -> bool {
        if current_dir.join(".noindex").exists() {
            return true;
        }
        let common_paths = [
            "src",
            "lib",
            "tests",
            "test",
            "docs",
            "doc",
            "examples",
            "example",
            "target",
            "build",
            "dist",
            "node_modules",
            ".git",
            "vendor",
        ];
        for subdir in &common_paths {
            let noindex_path = current_dir.join(subdir).join(".noindex");
            if noindex_path.exists() {
                return true;
            }
        }
        false
    }

    /// Builds a gitignore-style matcher from the root's `.gitignore` and
    /// `.noindex` files (each loaded only if present). Load failures are
    /// reported on stderr unless `quiet` and otherwise ignored.
    pub fn create_matcher(current_dir: &Path, quiet: bool) -> Result<ignore::gitignore::Gitignore> {
        let mut builder = ignore::gitignore::GitignoreBuilder::new(current_dir);
        let gitignore_path = current_dir.join(".gitignore");
        if gitignore_path.exists() {
            // `GitignoreBuilder::add` returns Some(error) on failure.
            if let Some(e) = builder.add(&gitignore_path) {
                if !quiet {
                    eprintln!("Warning: Failed to load .gitignore file: {}", e);
                }
            }
        }
        let noindex_path = current_dir.join(".noindex");
        if noindex_path.exists() {
            if let Some(e) = builder.add(&noindex_path) {
                if !quiet {
                    eprintln!("Warning: Failed to load .noindex file for matcher: {}", e);
                }
            }
        }
        Ok(builder.build()?)
    }
}
/// Thin free-function facade over [`GitUtils`] so callers can write
/// `git::…` without importing the utility type directly. Every function
/// here is a one-line delegation.
pub mod git {
    use super::GitUtils;
    use anyhow::Result;
    use std::path::Path;

    /// Delegates to [`GitUtils::is_git_repo_root`].
    pub fn is_git_repo_root(path: &Path) -> bool {
        GitUtils::is_git_repo_root(path)
    }

    /// Delegates to [`GitUtils::find_git_root`].
    pub fn find_git_root(start_path: &Path) -> Option<std::path::PathBuf> {
        GitUtils::find_git_root(start_path)
    }

    /// Delegates to [`GitUtils::get_current_commit_hash`].
    pub fn get_current_commit_hash(repo_path: &Path) -> Result<String> {
        GitUtils::get_current_commit_hash(repo_path)
    }

    /// Delegates to [`GitUtils::get_changed_files_since_commit`].
    pub fn get_changed_files_since_commit(
        repo_path: &Path,
        since_commit: &str,
    ) -> Result<Vec<String>> {
        GitUtils::get_changed_files_since_commit(repo_path, since_commit)
    }

    /// Delegates to [`GitUtils::get_all_changed_files`].
    pub fn get_all_changed_files(repo_path: &Path) -> Result<Vec<String>> {
        GitUtils::get_all_changed_files(repo_path)
    }
}
/// Delegates to [`FileUtils::get_file_mtime`]. Presumably a unix timestamp
/// in seconds — confirm against `FileUtils`; callers only compare values.
pub fn get_file_mtime(file_path: &std::path::Path) -> Result<u64> {
    FileUtils::get_file_mtime(file_path)
}
/// Delegates to [`FileUtils::detect_language`]: maps a path to a language
/// identifier (callers compare against "markdown") or `None` for
/// unrecognized files.
pub fn detect_language(path: &std::path::Path) -> Option<&str> {
    FileUtils::detect_language(path)
}
/// Removes index entries for files that no longer exist on disk, or that
/// have since become ignored by `.gitignore`/`.noindex` rules.
///
/// Removals are flushed in chunks (of roughly `CHUNK_SIZE`) so very large
/// cleanups do not accumulate unbounded pending deletions. Individual
/// removal failures are logged (stderr unless `quiet`, plus tracing) but do
/// not abort the cleanup; only flush failures propagate as errors.
async fn cleanup_deleted_files_optimized(
    store: &Store,
    current_dir: &std::path::Path,
    quiet: bool,
) -> Result<()> {
    let indexed_files = store.get_all_indexed_file_paths().await?;
    if indexed_files.is_empty() {
        return Ok(());
    }
    let ignore_matcher = NoindexWalker::create_matcher(current_dir, quiet)?;
    let mut files_to_remove = Vec::new();
    let indexed_files_vec: Vec<String> = indexed_files.into_iter().collect();
    const CHUNK_SIZE: usize = 100;
    for chunk in indexed_files_vec.chunks(CHUNK_SIZE) {
        for indexed_file in chunk {
            let absolute_path = current_dir.join(indexed_file);
            if !absolute_path.exists() {
                // File was deleted on disk but is still in the index.
                files_to_remove.push(indexed_file.clone());
            } else {
                // File exists but may have become ignored since indexing.
                let is_ignored = ignore_matcher
                    .matched(&absolute_path, absolute_path.is_dir())
                    .is_ignore();
                if is_ignored {
                    files_to_remove.push(indexed_file.clone());
                }
            }
        }
        // Drain accumulated removals once they reach a full chunk.
        if files_to_remove.len() >= CHUNK_SIZE {
            for file_to_remove in &files_to_remove {
                if let Err(e) = store.remove_blocks_by_path(file_to_remove).await {
                    if !quiet {
                        eprintln!(
                            "Warning: Failed to remove blocks for {}: {}",
                            file_to_remove, e
                        );
                    }
                    tracing::warn!(
                        file = %file_to_remove,
                        error = %e,
                        "Failed to remove blocks during cleanup"
                    );
                }
            }
            files_to_remove.clear();
            store.flush().await?;
        }
    }
    // Final partial batch.
    if !files_to_remove.is_empty() {
        for file_to_remove in &files_to_remove {
            if let Err(e) = store.remove_blocks_by_path(file_to_remove).await {
                if !quiet {
                    eprintln!(
                        "Warning: Failed to remove blocks for {}: {}",
                        file_to_remove, e
                    );
                }
                // Consistency fix: the chunked path above also records the
                // failure via tracing; do the same here.
                tracing::warn!(
                    file = %file_to_remove,
                    error = %e,
                    "Failed to remove blocks during cleanup"
                );
            }
        }
        store.flush().await?;
    }
    Ok(())
}
/// Flushes `store` when `force` is set or once `batches_processed` has
/// reached the configured flush frequency; the counter is reset to zero
/// after a flush. Returns `Ok(true)` iff a flush happened.
async fn flush_if_needed(
    store: &Store,
    batches_processed: &mut usize,
    config: &Config,
    force: bool,
) -> Result<bool> {
    let flush_due = force || *batches_processed >= config.index.flush_frequency;
    if !flush_due {
        return Ok(false);
    }
    store.flush().await?;
    *batches_processed = 0;
    Ok(true)
}
/// Quickly estimates how many files the indexer will process, to seed the
/// progress counter.
///
/// With `git_changed_files` present, only those paths are counted, using
/// `detect_language` / `is_allowed_text_extension`. Otherwise the ignore-
/// aware walker is used with a hard-coded extension allow-list.
/// NOTE(review): the two paths use different criteria and may disagree with
/// what actually gets indexed — the count is a progress estimate, not a
/// guarantee; confirm the lists stay in sync with the processing loops.
fn fast_count_indexable_files(
    current_dir: &Path,
    git_changed_files: Option<&std::collections::HashSet<String>>,
) -> usize {
    let mut count = 0;
    if let Some(changed_files) = git_changed_files {
        for file_path in changed_files {
            let full_path = current_dir.join(file_path);
            // Skip deleted/non-regular paths still listed by git.
            if !full_path.is_file() {
                continue;
            }
            if detect_language(&full_path).is_some() || is_allowed_text_extension(&full_path) {
                count += 1;
            }
        }
        return count;
    }
    // Full scan: walk the tree honoring gitignore/.noindex rules.
    let walker = NoindexWalker::create_walker(current_dir).build();
    for result in walker {
        let entry = match result {
            Ok(entry) => entry,
            // Unreadable entries are simply not counted.
            Err(_) => continue,
        };
        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
            continue;
        }
        // Case-sensitive extension allow-list (note: extension-less files
        // like "Makefile" are never counted here).
        if let Some(ext) = entry.path().extension() {
            let ext_str = ext.to_str().unwrap_or("");
            if matches!(
                ext_str,
                "rs" | "js"
                    | "ts" | "jsx" | "tsx"
                    | "py" | "go" | "java"
                    | "c" | "cpp" | "h"
                    | "hpp" | "cs" | "php"
                    | "rb" | "swift"
                    | "kt" | "scala"
                    | "r" | "m" | "mm"
                    | "md" | "markdown"
                    | "txt" | "json"
                    | "yaml" | "yml"
                    | "toml" | "xml"
                    | "html" | "css"
                    | "scss" | "sass"
                    | "less" | "sql"
                    | "sh" | "bash" | "zsh"
                    | "fish" | "vim"
                    | "lua" | "pl" | "pm"
                    | "t" | "pod" | "raku"
                    | "rakumod" | "rakudoc"
                    | "nix" | "dhall"
                    | "tf" | "tfvars"
                    | "hcl" | "vue" | "svelte"
                    | "elm" | "purs"
                    | "hs" | "lhs" | "ml"
                    | "mli" | "fs" | "fsi"
                    | "fsx" | "clj" | "cljs"
                    | "cljc" | "edn"
                    | "ex" | "exs" | "erl"
                    | "hrl" | "zig" | "v"
                    | "vsh" | "nim" | "nims"
                    | "cr" | "jl" | "d"
                    | "dart" | "pas"
                    | "pp" | "inc" | "asm"
                    | "s" | "S" | "rst"
                    | "adoc" | "tex"
                    | "bib" | "org" | "wiki"
                    | "pod6" | "rakutest"
                    | "cfg" | "conf"
                    | "config" | "ini"
                    | "env" | "properties"
                    | "gradle" | "cmake"
                    | "make" | "makefile"
                    | "dockerfile" | "containerfile"
                    | "vagrantfile" | "gemfile"
                    | "rakefile" | "guardfile"
                    | "podfile" | "fastfile"
                    | "brewfile"
            ) {
                count += 1;
            }
        }
    }
    count
}
/// Flushes `store`, then records the current git commit hash (and, when
/// GraphRAG is enabled, the GraphRAG commit hash) as index metadata.
///
/// A failed flush is fatal — metadata must never describe unflushed data.
/// Missing git information or metadata-write failures are logged but do not
/// fail the call. `context` tags every log line for traceability.
async fn persist_and_store_metadata(
    store: &Store,
    git_repo_root: Option<&Path>,
    config: &Config,
    quiet: bool,
    context: &str,
) -> Result<()> {
    if let Err(e) = store.flush().await {
        tracing::error!(
            context = context,
            error = %e,
            "Failed to flush store - metadata will NOT be stored"
        );
        return Err(e);
    }
    tracing::debug!(context = context, "Successfully flushed store");
    // No git repo: nothing further to record.
    let Some(git_root) = git_repo_root else {
        return Ok(());
    };
    let current_commit = match git::get_current_commit_hash(git_root) {
        Ok(hash) => hash,
        Err(e) => {
            tracing::warn!(
                context = context,
                error = %e,
                "Could not get current commit hash, skipping metadata storage"
            );
            return Ok(());
        }
    };
    // Restored mangled borrow: was garbled to `¤t_commit`.
    if let Err(e) = store.store_git_metadata(&current_commit).await {
        tracing::error!(
            context = context,
            commit = %current_commit,
            error = %e,
            "Failed to store git metadata"
        );
        if !quiet {
            eprintln!("Warning: Could not store git metadata: {}", e);
        }
    } else {
        tracing::debug!(
            context = context,
            commit = %current_commit,
            "Successfully stored git metadata"
        );
    }
    if config.graphrag.enabled {
        // Restored mangled borrow: was garbled to `¤t_commit`.
        if let Err(e) = store.store_graphrag_commit_hash(&current_commit).await {
            tracing::error!(
                context = context,
                commit = %current_commit,
                error = %e,
                "Failed to store GraphRAG git metadata"
            );
            if !quiet {
                eprintln!("Warning: Could not store GraphRAG git metadata: {}", e);
            }
        } else {
            tracing::debug!(
                context = context,
                commit = %current_commit,
                "Successfully stored GraphRAG metadata"
            );
        }
    }
    Ok(())
}
/// Convenience entry point: runs [`index_files_with_quiet`] with console
/// output enabled (`quiet = false`).
pub async fn index_files(
    store: &Store,
    state: SharedState,
    config: &Config,
    git_repo_root: Option<&Path>,
) -> Result<()> {
    let quiet = false;
    index_files_with_quiet(store, state, config, git_repo_root, quiet).await
}
pub async fn index_branch_delta(
branch_store: &Store,
state: SharedState,
config: &Config,
git_repo_root: &Path,
branch_name: &str,
quiet: bool,
) -> Result<branch::BranchManifest> {
let default_branch = GitUtils::get_default_branch(git_repo_root)?;
let current_dir = state.read().current_directory.clone();
let (branch_dir, existing_manifest) = branch::resolve_branch_state(¤t_dir, branch_name)?;
let force_rebuild = existing_manifest
.as_ref()
.map(|m| m.base_branch != default_branch)
.unwrap_or(false);
if force_rebuild && !quiet {
println!(
"⚠️ Base branch changed ({} → {}), rebuilding delta from scratch",
existing_manifest.as_ref().unwrap().base_branch,
default_branch
);
}
let (changed_files, deleted_files) =
branch::compute_branch_delta(git_repo_root, &default_branch)?;
if !quiet {
println!(
"🔀 Branch '{}' delta: {} changed, {} deleted files (vs '{}')",
branch_name,
changed_files.len(),
deleted_files.len(),
default_branch,
);
}
let branch_commit = branch::get_branch_commit(git_repo_root, "HEAD")?;
let base_commit = branch::get_branch_commit(git_repo_root, &default_branch)?;
if !force_rebuild {
if let Some(ref manifest) = existing_manifest {
if manifest.branch_commit == branch_commit
&& changed_files.is_empty()
&& deleted_files.is_empty()
{
if !quiet {
println!("✅ Branch delta unchanged, skipping reindex");
}
return Ok(manifest.clone());
}
}
}
let manifest_changed_paths = changed_files.clone();
let manifest_deleted_paths = deleted_files.clone();
let files_to_process: std::collections::HashSet<String> = if !force_rebuild {
if let Some(ref manifest) = existing_manifest {
let old_changed: std::collections::HashSet<&str> =
manifest.changed_paths.iter().map(|s| s.as_str()).collect();
let new_changed: std::collections::HashSet<&str> =
changed_files.iter().map(|s| s.as_str()).collect();
for path in old_changed.difference(&new_changed) {
if let Err(e) = branch_store.remove_blocks_by_path(path).await {
tracing::warn!(path = %path, error = %e, "Failed to remove reverted file from branch DB");
}
}
if manifest.branch_commit != branch_commit {
match git::get_changed_files_since_commit(git_repo_root, &manifest.branch_commit) {
Ok(since_last) => {
let since_set: std::collections::HashSet<String> =
since_last.into_iter().collect();
changed_files
.into_iter()
.filter(|f| {
since_set.contains(f.as_str()) || !old_changed.contains(f.as_str())
})
.collect()
}
Err(_) => changed_files.into_iter().collect(),
}
} else {
changed_files.into_iter().collect()
}
} else {
changed_files.into_iter().collect()
}
} else {
changed_files.into_iter().collect()
};
if files_to_process.is_empty() && !force_rebuild {
if !quiet {
println!("✅ No files need re-processing in branch delta");
}
} else {
for file_path in &files_to_process {
if let Err(e) = branch_store.remove_blocks_by_path(file_path).await {
tracing::warn!(path = %file_path, error = %e, "Failed to clean up branch block");
}
}
if !files_to_process.is_empty() {
branch_store.flush().await?;
}
{
let mut state_guard = state.write();
state_guard.total_files = files_to_process.len();
state_guard.counting_files = false;
state_guard.status_message = format!(
"Indexing branch delta ({} files)...",
files_to_process.len()
);
state_guard.graphrag_enabled = config.graphrag.enabled;
}
let branch_config = config.clone();
let mut code_blocks_batch = Vec::new();
let mut text_blocks_batch = Vec::new();
let mut document_blocks_batch = Vec::new();
let mut all_code_blocks = Vec::new();
let mut file_context_map = contextual::FileContextMap::new();
let mut code_file_metadata = FileMetadataBatch::new();
let mut text_file_metadata = FileMetadataBatch::new();
let mut document_file_metadata = FileMetadataBatch::new();
let mut embedding_calls = 0;
let mut batches_processed = 0;
let mut files_processed = 0;
for file_path in &files_to_process {
let full_path = current_dir.join(file_path);
if !full_path.is_file() {
continue;
}
if let Some(language) = detect_language(&full_path) {
match fs::read_to_string(&full_path) {
Ok(contents) => {
let ctx = ProcessFileContext {
store: branch_store,
config: &branch_config,
state: state.clone(),
};
if language == "markdown" {
process_markdown_file_differential(
branch_store,
&contents,
file_path,
&mut document_blocks_batch,
&branch_config,
state.clone(),
)
.await?;
} else {
process_file_differential(
&ctx,
&contents,
file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch,
&mut all_code_blocks,
&mut file_context_map,
)
.await?;
}
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
code_file_metadata.add(file_path, actual_mtime);
}
files_processed += 1;
state.write().indexed_files = files_processed;
if should_process_batch(&code_blocks_batch, |b| &b.content, &branch_config)
{
embedding_calls += code_blocks_batch.len();
match process_code_blocks_batch(
branch_store,
&code_blocks_batch,
&branch_config,
&code_file_metadata,
&file_context_map,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(
branch_store,
&mut batches_processed,
&branch_config,
false,
)
.await?;
}
Err(e) => {
tracing::error!("Failed to process code blocks batch: {}", e);
}
}
code_blocks_batch.clear();
code_file_metadata.clear();
file_context_map.clear();
}
if should_process_batch(&text_blocks_batch, |b| &b.content, &branch_config)
{
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(
branch_store,
&text_blocks_batch,
&branch_config,
&text_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(
branch_store,
&mut batches_processed,
&branch_config,
false,
)
.await?;
}
Err(e) => {
tracing::error!("Failed to process text blocks batch: {}", e);
}
}
text_blocks_batch.clear();
text_file_metadata.clear();
}
if should_process_batch(
&document_blocks_batch,
|b| &b.content,
&branch_config,
) {
embedding_calls += document_blocks_batch.len();
match process_document_blocks_batch(
branch_store,
&document_blocks_batch,
&branch_config,
&document_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(
branch_store,
&mut batches_processed,
&branch_config,
false,
)
.await?;
}
Err(e) => {
tracing::error!(
"Failed to process document blocks batch: {}",
e
);
}
}
document_blocks_batch.clear();
document_file_metadata.clear();
}
}
Err(e) => {
log_file_processing_error(file_path, "read_file", &e);
}
}
} else if is_allowed_text_extension(&full_path) && !is_markdown_file(&full_path) {
if let Ok(contents) = fs::read_to_string(&full_path) {
if is_text_file(&contents) {
process_text_file_differential(
branch_store,
&contents,
file_path,
&mut text_blocks_batch,
&branch_config,
state.clone(),
)
.await?;
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
text_file_metadata.add(file_path, actual_mtime);
}
files_processed += 1;
state.write().indexed_files = files_processed;
if should_process_batch(&text_blocks_batch, |b| &b.content, &branch_config)
{
embedding_calls += text_blocks_batch.len();
process_text_blocks_batch(
branch_store,
&text_blocks_batch,
&branch_config,
&text_file_metadata,
)
.await?;
text_blocks_batch.clear();
text_file_metadata.clear();
batches_processed += 1;
flush_if_needed(
branch_store,
&mut batches_processed,
&branch_config,
false,
)
.await?;
}
}
}
}
}
if !code_blocks_batch.is_empty() {
if let Err(e) = process_code_blocks_batch(
branch_store,
&code_blocks_batch,
&branch_config,
&code_file_metadata,
&file_context_map,
)
.await
{
tracing::error!("Failed to process remaining code blocks batch: {}", e);
}
}
if !text_blocks_batch.is_empty() {
if let Err(e) = process_text_blocks_batch(
branch_store,
&text_blocks_batch,
&branch_config,
&text_file_metadata,
)
.await
{
tracing::error!("Failed to process remaining text blocks batch: {}", e);
}
}
if !document_blocks_batch.is_empty() {
if let Err(e) = process_document_blocks_batch(
branch_store,
&document_blocks_batch,
&branch_config,
&document_file_metadata,
)
.await
{
tracing::error!("Failed to process remaining document blocks batch: {}", e);
}
}
branch_store.flush().await?;
if config.graphrag.enabled && !all_code_blocks.is_empty() {
if !quiet {
println!("🔗 Building GraphRAG for branch delta...");
}
let graph_store = Store::new_for_branch(branch_name).await?;
let graph_builder = graphrag::GraphBuilder::new_with_store(
config.clone(),
¤t_dir,
graph_store,
quiet,
)
.await?;
graph_builder
.process_code_blocks(&all_code_blocks, Some(state.clone()))
.await?;
branch_store.flush().await?;
}
if !quiet {
println!(
"✓ Branch delta indexed: {} files processed ({} embeddings)",
files_processed, embedding_calls
);
}
}
{
let mut state_guard = state.write();
state_guard.indexing_complete = true;
}
let manifest = branch::BranchManifest {
version: 1,
branch_name: branch_name.to_string(),
base_branch: default_branch,
base_commit,
branch_commit,
changed_paths: manifest_changed_paths,
deleted_paths: manifest_deleted_paths,
indexed_at: chrono::Utc::now().timestamp(),
};
branch::save_manifest(&branch_dir, &manifest)?;
Ok(manifest)
}
/// Verifies that every configured model which must emit structured output
/// actually supports it (GraphRAG description model when GraphRAG+LLM are
/// enabled; contextual model when contextual descriptions are enabled).
///
/// Returns an error naming the offending model; models that cannot be
/// instantiated only produce a stderr warning (suppressed by `quiet`).
fn validate_llm_structured_output(config: &Config, quiet: bool) -> Result<()> {
    let mut candidates: Vec<(&str, &str)> = Vec::new();
    if config.graphrag.enabled && config.graphrag.use_llm {
        candidates.push((
            &config.graphrag.llm.description_model,
            "graphrag.llm.description_model",
        ));
    }
    if config.index.contextual_descriptions {
        candidates.push((&config.index.contextual_model, "index.contextual_model"));
    }
    for (model_str, config_key) in candidates {
        match LlmClient::with_model(config, model_str) {
            Ok(client) if !client.supports_structured_output() => {
                return Err(anyhow::anyhow!(
                    "Model '{}' ({}) does not support structured output. \
                    Choose a model that supports JSON schema (e.g. openai/gpt-4o-mini, gemini-2.0-flash).",
                    model_str,
                    config_key
                ));
            }
            Ok(_) => {}
            Err(e) => {
                if !quiet {
                    eprintln!(
                        "Warning: Could not validate model '{}' ({}): {}",
                        model_str, config_key, e
                    );
                }
            }
        }
    }
    Ok(())
}
pub async fn index_files_with_quiet(
store: &Store,
state: SharedState,
config: &Config,
git_repo_root: Option<&Path>,
quiet: bool,
) -> Result<()> {
let current_dir = state.read().current_directory.clone();
let mut code_blocks_batch = Vec::new();
let mut text_blocks_batch = Vec::new();
let mut document_blocks_batch = Vec::new();
let mut all_code_blocks = Vec::new(); let mut file_context_map = contextual::FileContextMap::new();
let mut code_file_metadata = FileMetadataBatch::new();
let mut text_file_metadata = FileMetadataBatch::new();
let mut document_file_metadata = FileMetadataBatch::new();
let mut embedding_calls = 0;
let mut batches_processed = 0;
log_indexing_progress(
"indexing_start",
0,
0,
Some(¤t_dir.display().to_string()),
0,
);
validate_llm_structured_output(config, quiet)?;
{
let mut state_guard = state.write();
state_guard.graphrag_enabled = config.graphrag.enabled;
state_guard.graphrag_blocks = 0;
state_guard.counting_files = true;
state_guard.status_message = "Counting files...".to_string();
state_guard.quiet_mode = quiet;
}
let force_reindex = state.read().force_reindex;
let git_changed_files = if let Some(git_root) = git_repo_root {
if !force_reindex {
if let Ok(Some(last_commit)) = store.get_last_commit_hash().await {
if let Ok(current_commit) = git::get_current_commit_hash(git_root) {
if last_commit != current_commit {
match git::get_changed_files_since_commit(git_root, &last_commit) {
Ok(changed_files) => {
if !quiet {
println!(
"🚀 Git optimization: Commit changed, found {} files to reindex",
changed_files.len()
);
}
for file_path in &changed_files {
if let Err(e) = store.remove_blocks_by_path(file_path).await {
if !quiet {
eprintln!(
"Warning: Failed to clean up data for {}: {}",
file_path, e
);
}
}
}
if !changed_files.is_empty() {
store.flush().await?;
tracing::debug!(
files_cleaned = changed_files.len(),
"Flushed after cleanup of changed files"
);
}
Some(
changed_files
.into_iter()
.collect::<std::collections::HashSet<_>>(),
)
}
Err(e) => {
if !quiet {
eprintln!(
"Warning: Could not get git changes, indexing all files: {}",
e
);
}
None
}
}
} else {
if !quiet {
println!("✅ No commit changes since last index, skipping reindex");
}
{
let mut state_guard = state.write();
state_guard.status_message =
"Checking for deleted files...".to_string();
}
if let Err(e) =
cleanup_deleted_files_optimized(store, ¤t_dir, quiet).await
{
if !quiet {
eprintln!("Warning: Failed to clean up deleted files: {}", e);
}
tracing::warn!(
error = %e,
"Failed to clean up deleted files during same-commit path"
);
}
if config.graphrag.enabled {
let needs_graphrag_from_existing =
match store.graphrag_needs_indexing().await {
Ok(v) => v,
Err(e) => {
tracing::warn!(
error = %e,
"Failed to check if GraphRAG needs indexing, assuming false"
);
false
}
};
if needs_graphrag_from_existing {
if !quiet {
println!("🔗 Building GraphRAG from existing database...");
}
log_indexing_progress("graphrag_build", 0, 0, None, 0);
let graph_builder = graphrag::GraphBuilder::new_with_quiet(
config.clone(),
¤t_dir,
quiet,
)
.await?;
graph_builder
.build_from_existing_database(Some(state.clone()))
.await?;
persist_and_store_metadata(
store,
Some(git_root),
config,
quiet,
"graphrag_from_existing",
)
.await?;
}
}
if let Err(e) =
commits::index_commits(config, store, git_root, state.clone(), quiet)
.await
{
tracing::warn!("Commit indexing failed: {}", e);
}
{
let mut state_guard = state.write();
state_guard.indexing_complete = true;
}
return Ok(());
}
} else {
if !quiet {
println!("⚠️ Could not get current commit hash, indexing all files");
}
None
}
} else {
if !quiet {
println!("📋 First-time git indexing: indexing all files");
}
None
}
} else {
None
}
} else {
None
};
let should_cleanup_deleted_files = {
let force_reindex = state.read().force_reindex;
!force_reindex };
if should_cleanup_deleted_files {
{
let mut state_guard = state.write();
state_guard.status_message = "Checking for deleted files...".to_string();
}
log_indexing_progress("cleanup", 0, 0, None, 0);
if let Err(e) = cleanup_deleted_files_optimized(store, ¤t_dir, quiet).await {
if !quiet {
eprintln!("Warning: Cleanup failed: {}", e);
}
}
{
let mut state_guard = state.write();
state_guard.status_message = "".to_string();
}
}
{
let mut state_guard = state.write();
state_guard.status_message = "Loading file metadata...".to_string();
}
let file_metadata_map = store.get_all_file_metadata().await?;
if !quiet {
println!(
"📊 Loaded metadata for {} files from database",
file_metadata_map.len()
);
}
{
let mut state_guard = state.write();
state_guard.total_files = 0; state_guard.counting_files = true;
state_guard.status_message = "Starting indexing...".to_string();
}
let total_files_found;
let mut files_processed = 0;
log_indexing_progress("file_processing", 0, 0, None, 0);
if let Some(ref changed_files) = git_changed_files {
total_files_found = fast_count_indexable_files(¤t_dir, Some(changed_files));
{
let mut state_guard = state.write();
state_guard.total_files = total_files_found;
state_guard.counting_files = false;
state_guard.status_message = "".to_string();
}
for file_path in changed_files {
let full_path = current_dir.join(file_path);
if !full_path.is_file() {
continue;
}
let force_reindex = state.read().force_reindex;
if !force_reindex {
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
if let Some(stored_mtime) = file_metadata_map.get(file_path) {
if actual_mtime <= *stored_mtime {
{
let mut state_guard = state.write();
state_guard.skipped_files += 1;
}
continue;
}
}
}
}
if let Some(language) = detect_language(&full_path) {
match fs::read_to_string(&full_path) {
Ok(contents) => {
let file_processed;
let ctx = ProcessFileContext {
store,
config,
state: state.clone(),
};
if language == "markdown" {
process_markdown_file_differential(
store,
&contents,
file_path,
&mut document_blocks_batch,
config,
state.clone(),
)
.await?;
file_processed = true;
} else {
process_file_differential(
&ctx,
&contents,
file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch, &mut all_code_blocks,
&mut file_context_map,
)
.await?;
file_processed = true;
}
if file_processed {
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
code_file_metadata.add(file_path, actual_mtime);
}
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(file_path),
embedding_calls,
);
}
if should_process_batch(&code_blocks_batch, |b| &b.content, config) {
embedding_calls += code_blocks_batch.len();
match process_code_blocks_batch(
store,
&code_blocks_batch,
config,
&code_file_metadata,
&file_context_map,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!("Failed to process code blocks batch: {}", e);
}
}
code_blocks_batch.clear();
code_file_metadata.clear();
file_context_map.clear();
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(
store,
&text_blocks_batch,
config,
&text_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!("Failed to process text blocks batch: {}", e);
}
}
text_blocks_batch.clear();
text_file_metadata.clear();
}
if should_process_batch(&document_blocks_batch, |b| &b.content, config) {
embedding_calls += document_blocks_batch.len();
match process_document_blocks_batch(
store,
&document_blocks_batch,
config,
&document_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!(
"Failed to process document blocks batch: {}",
e
);
}
}
document_blocks_batch.clear();
document_file_metadata.clear();
}
}
Err(e) => {
log_file_processing_error(file_path, "read_file", &e);
}
}
} else {
if is_allowed_text_extension(&full_path) && !is_markdown_file(&full_path) {
if let Ok(contents) = fs::read_to_string(&full_path) {
if is_text_file(&contents) {
process_text_file_differential(
store,
&contents,
file_path,
&mut text_blocks_batch,
config,
state.clone(),
)
.await?;
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
text_file_metadata.add(file_path, actual_mtime);
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(file_path),
embedding_calls,
);
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(
store,
&text_blocks_batch,
config,
&text_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(
store,
&mut batches_processed,
config,
false,
)
.await?;
}
Err(e) => {
tracing::error!(
"Failed to process text blocks batch: {}",
e
);
}
}
text_blocks_batch.clear();
text_file_metadata.clear();
}
}
}
}
}
}
} else {
total_files_found = fast_count_indexable_files(¤t_dir, None);
{
let mut state_guard = state.write();
state_guard.total_files = total_files_found;
state_guard.counting_files = false;
state_guard.status_message = "".to_string();
}
let walker = NoindexWalker::create_walker(¤t_dir).build();
for result in walker {
let entry = match result {
Ok(entry) => entry,
Err(_) => continue,
};
if !entry.file_type().is_some_and(|ft| ft.is_file()) {
continue;
}
let file_path = path_utils::PathUtils::to_relative_string(entry.path(), ¤t_dir);
let force_reindex = state.read().force_reindex;
if !force_reindex {
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
if let Some(stored_mtime) = file_metadata_map.get(&file_path) {
if actual_mtime <= *stored_mtime {
{
let mut state_guard = state.write();
state_guard.skipped_files += 1;
}
continue;
}
}
}
}
if let Some(language) = detect_language(entry.path()) {
match fs::read_to_string(entry.path()) {
Ok(contents) => {
let file_processed;
if language == "markdown" {
process_markdown_file_differential(
store,
&contents,
&file_path,
&mut document_blocks_batch,
config,
state.clone(),
)
.await?;
file_processed = true;
} else {
let ctx = ProcessFileContext {
store,
config,
state: state.clone(),
};
process_file_differential(
&ctx,
&contents,
&file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch, &mut all_code_blocks,
&mut file_context_map,
)
.await?;
file_processed = true;
}
if file_processed {
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
code_file_metadata.add(&file_path, actual_mtime);
}
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(&file_path),
embedding_calls,
);
}
if should_process_batch(&code_blocks_batch, |b| &b.content, config) {
embedding_calls += code_blocks_batch.len();
match process_code_blocks_batch(
store,
&code_blocks_batch,
config,
&code_file_metadata,
&file_context_map,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!("Failed to process code blocks batch: {}", e);
}
}
code_blocks_batch.clear();
code_file_metadata.clear();
file_context_map.clear();
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(
store,
&text_blocks_batch,
config,
&text_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!("Failed to process text blocks batch: {}", e);
}
}
text_blocks_batch.clear();
text_file_metadata.clear();
}
if should_process_batch(&document_blocks_batch, |b| &b.content, config) {
embedding_calls += document_blocks_batch.len();
match process_document_blocks_batch(
store,
&document_blocks_batch,
config,
&document_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
Err(e) => {
tracing::error!(
"Failed to process document blocks batch: {}",
e
);
}
}
document_blocks_batch.clear();
document_file_metadata.clear();
}
}
Err(e) => {
log_file_processing_error(&file_path, "read_file", &e);
}
}
} else {
if is_allowed_text_extension(entry.path()) && !is_markdown_file(entry.path()) {
if let Ok(contents) = fs::read_to_string(entry.path()) {
if is_text_file(&contents) {
process_text_file_differential(
store,
&contents,
&file_path,
&mut text_blocks_batch,
config,
state.clone(),
)
.await?;
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
text_file_metadata.add(&file_path, actual_mtime);
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(&file_path),
embedding_calls,
);
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(
store,
&text_blocks_batch,
config,
&text_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
flush_if_needed(
store,
&mut batches_processed,
config,
false,
)
.await?;
}
Err(e) => {
tracing::error!(
"Failed to process text blocks batch: {}",
e
);
}
}
text_blocks_batch.clear();
text_file_metadata.clear();
}
}
}
}
}
}
}
if !code_blocks_batch.is_empty() {
embedding_calls += code_blocks_batch.len();
match process_code_blocks_batch(
store,
&code_blocks_batch,
config,
&code_file_metadata,
&file_context_map,
)
.await
{
Ok(()) => {
batches_processed += 1;
}
Err(e) => {
tracing::error!("Failed to process remaining code blocks batch: {}", e);
}
}
}
if !text_blocks_batch.is_empty() {
embedding_calls += text_blocks_batch.len();
match process_text_blocks_batch(store, &text_blocks_batch, config, &text_file_metadata)
.await
{
Ok(()) => {
batches_processed += 1;
}
Err(e) => {
tracing::error!("Failed to process remaining text blocks batch: {}", e);
}
}
}
if !document_blocks_batch.is_empty() {
embedding_calls += document_blocks_batch.len();
match process_document_blocks_batch(
store,
&document_blocks_batch,
config,
&document_file_metadata,
)
.await
{
Ok(()) => {
batches_processed += 1;
}
Err(e) => {
tracing::error!("Failed to process remaining document blocks batch: {}", e);
}
}
}
flush_if_needed(store, &mut batches_processed, config, true).await?;
if config.graphrag.enabled {
let needs_graphrag_from_existing = if all_code_blocks.is_empty() {
let existing_blocks = store
.get_all_code_blocks_for_graphrag()
.await
.unwrap_or_default();
let needs_indexing = match store.graphrag_needs_indexing().await {
Ok(v) => v,
Err(e) => {
tracing::warn!(
error = %e,
"Failed to check if GraphRAG needs indexing, assuming false"
);
false
}
};
!existing_blocks.is_empty() && needs_indexing
} else {
false };
if !all_code_blocks.is_empty() || needs_graphrag_from_existing {
{
let mut state_guard = state.write();
if needs_graphrag_from_existing {
state_guard.status_message =
"Building GraphRAG from existing database...".to_string();
} else {
state_guard.status_message = "Building GraphRAG knowledge graph...".to_string();
}
}
log_indexing_progress(
"graphrag_build",
state.read().indexed_files,
state.read().total_files,
None,
embedding_calls,
);
let graph_builder =
graphrag::GraphBuilder::new_with_quiet(config.clone(), ¤t_dir, quiet).await?;
if needs_graphrag_from_existing {
graph_builder
.build_from_existing_database(Some(state.clone()))
.await?;
} else {
graph_builder
.process_code_blocks(&all_code_blocks, Some(state.clone()))
.await?;
}
{
let mut state_guard = state.write();
state_guard.status_message = "".to_string();
}
}
}
{
let mut state_guard = state.write();
state_guard.indexing_complete = true;
state_guard.embedding_calls = embedding_calls;
}
let final_files = state.read().indexed_files;
let final_total = state.read().total_files;
log_indexing_progress(
"indexing_complete",
final_files,
final_total,
None,
embedding_calls,
);
if let Some(repo_root) = git_repo_root {
if let Err(e) = commits::index_commits(config, store, repo_root, state.clone(), quiet).await
{
tracing::warn!("Commit indexing failed: {}", e);
}
}
persist_and_store_metadata(store, git_repo_root, config, quiet, "indexing_complete").await?;
Ok(())
}
pub async fn handle_file_change(store: &Store, file_path: &str, config: &Config) -> Result<()> {
let state = state::create_shared_state();
{
let mut state_guard = state.write();
state_guard.graphrag_enabled = config.graphrag.enabled;
state_guard.graphrag_blocks = 0;
}
store.remove_blocks_by_path(file_path).await?;
let path = std::path::Path::new(file_path);
if path.exists() {
let current_dir = std::env::current_dir()?;
let absolute_path = if path.is_absolute() {
path.to_path_buf()
} else {
current_dir.join(path)
};
if let Ok(matcher) = NoindexWalker::create_matcher(¤t_dir, true) {
if matcher
.matched(&absolute_path, absolute_path.is_dir())
.is_ignore()
{
return Ok(());
}
}
if let Some(language) = detect_language(&absolute_path) {
if let Ok(contents) = fs::read_to_string(&absolute_path) {
let relative_file_path =
path_utils::PathUtils::to_relative_string(&absolute_path, ¤t_dir);
if language == "markdown" {
let mut document_blocks_batch = Vec::new();
let mut document_file_metadata = FileMetadataBatch::new();
process_markdown_file(
store,
&contents,
&relative_file_path,
&mut document_blocks_batch,
config,
state.clone(),
)
.await?;
if let Ok(mtime) = get_file_mtime(&absolute_path) {
document_file_metadata.add(&relative_file_path, mtime);
}
if !document_blocks_batch.is_empty() {
process_document_blocks_batch(
store,
&document_blocks_batch,
config,
&document_file_metadata,
)
.await?;
}
} else {
let mut code_blocks_batch = Vec::new();
let mut text_blocks_batch = Vec::new(); let mut all_code_blocks = Vec::new(); let mut code_file_metadata = FileMetadataBatch::new();
let mut file_context_map = contextual::FileContextMap::new();
let ctx = ProcessFileContext {
store,
config,
state: state.clone(),
};
process_file(
&ctx,
&contents,
&relative_file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch,
&mut all_code_blocks,
&mut file_context_map,
)
.await?;
if let Ok(mtime) = get_file_mtime(&absolute_path) {
code_file_metadata.add(&relative_file_path, mtime);
}
if !code_blocks_batch.is_empty() {
process_code_blocks_batch(
store,
&code_blocks_batch,
config,
&code_file_metadata,
&file_context_map,
)
.await?;
}
if config.graphrag.enabled && !all_code_blocks.is_empty() {
let graph_builder =
graphrag::GraphBuilder::new(config.clone(), ¤t_dir).await?;
graph_builder
.process_code_blocks(&all_code_blocks, Some(state.clone()))
.await?;
}
}
store.flush().await?;
}
} else {
if is_allowed_text_extension(&absolute_path) {
if let Ok(contents) = fs::read_to_string(&absolute_path) {
if is_text_file(&contents) {
let relative_file_path =
path_utils::PathUtils::to_relative_string(&absolute_path, ¤t_dir);
let mut text_blocks_batch = Vec::new();
process_text_file(
store,
&contents,
&relative_file_path,
&mut text_blocks_batch,
config,
state.clone(),
)
.await?;
let mut text_file_metadata = FileMetadataBatch::new();
if let Ok(mtime) = get_file_mtime(&absolute_path) {
text_file_metadata.add(&relative_file_path, mtime);
}
if !text_blocks_batch.is_empty() {
process_text_blocks_batch(
store,
&text_blocks_batch,
config,
&text_file_metadata,
)
.await?;
}
store.flush().await?;
}
}
}
}
}
Ok(())
}
#[cfg(test)]
mod context_optimization_tests {
    use super::*;

    /// A block with a three-level heading context: the embedding text is the
    /// joined context plus the content, while storage keeps them separate —
    /// so total stored bytes stay below the duplicated embedding-text size.
    #[test]
    fn test_context_optimization() {
        let block = DocumentBlock {
            path: "test.md".to_string(),
            title: "Test Section".to_string(),
            content: "This is the actual content.".to_string(),
            context: vec![
                "# Main Document".to_string(),
                "## Authentication".to_string(),
                "### JWT Implementation".to_string(),
            ],
            level: 3,
            start_line: 10,
            end_line: 15,
            hash: "test_hash".to_string(),
            distance: None,
        };
        let embedding_text = match block.context.is_empty() {
            true => block.content.clone(),
            false => format!("{}\n\n{}", block.context.join("\n"), block.content),
        };
        for expected in [
            "# Main Document",
            "## Authentication",
            "### JWT Implementation",
            "This is the actual content.",
        ] {
            assert!(embedding_text.contains(expected));
        }
        let context_size: usize = block.context.iter().map(String::len).sum();
        let total_size = block.content.len() + context_size;
        let old_approach_size = embedding_text.len() + block.content.len();
        assert!(total_size < old_approach_size);
        println!("New approach size: {} bytes", total_size);
        println!("Old approach size: {} bytes", old_approach_size);
        println!(
            "Memory savings: {}%",
            ((old_approach_size - total_size) as f64 / old_approach_size as f64 * 100.0) as i32
        );
    }

    /// With no context lines, the embedding text degenerates to the content.
    #[test]
    fn test_empty_context() {
        let block = DocumentBlock {
            path: "test.md".to_string(),
            title: "Test Section".to_string(),
            content: "Content without context.".to_string(),
            context: Vec::new(),
            level: 1,
            start_line: 0,
            end_line: 5,
            hash: "test_hash".to_string(),
            distance: None,
        };
        let embedding_text = match block.context.is_empty() {
            true => block.content.clone(),
            false => format!("{}\n\n{}", block.context.join("\n"), block.content),
        };
        assert_eq!(embedding_text, block.content);
    }

    /// Bottom-up chunking should merge tiny sections into their parents so
    /// almost no sub-100-char chunks survive.
    #[test]
    fn test_smart_chunking_eliminates_tiny_chunks() {
        let test_content = r#"# Main Document
## Section A
Some content here.
### Tiny Subsection
Only 33 symbols here - very small!
### Another Tiny
Also small content.
## Section B
This has more substantial content that should be fine on its own.
It has multiple lines and provides good context for understanding.
### Small Child
Brief content.
"#;
        let chunks = parse_document_hierarchy(test_content).bottom_up_chunking(2000);
        let tiny_chunks: Vec<&ChunkResult> = chunks
            .iter()
            .filter(|c| c.storage_content.len() < 100)
            .collect();
        println!("Generated {} chunks total", chunks.len());
        for (i, chunk) in chunks.iter().enumerate() {
            println!(
                "Chunk {}: {} chars - '{}'",
                i + 1,
                chunk.storage_content.len(),
                chunk.title
            );
        }
        if !tiny_chunks.is_empty() {
            println!("Found {} tiny chunks:", tiny_chunks.len());
            for chunk in &tiny_chunks {
                println!("- '{}': {} chars", chunk.title, chunk.storage_content.len());
            }
        }
        assert!(
            tiny_chunks.len() <= 1,
            "Should have at most 1 tiny chunk after smart merging"
        );
    }
}