pub mod batch_processor; pub mod code_region_extractor; pub mod differential_processor; pub mod file_processor; pub mod graph_optimization;
pub mod graphrag; pub mod languages; pub mod markdown_processor; pub mod search; pub mod signature_extractor;
pub mod render_utils;
pub use batch_processor::*;
pub use code_region_extractor::*;
pub use differential_processor::*;
pub use file_processor::*;
pub use graph_optimization::*;
pub use graphrag::*;
pub use languages::*;
pub use markdown_processor::*;
pub use search::*;
pub use signature_extractor::*;
use crate::config::Config;
use crate::mcp::logging::{log_file_processing_error, log_indexing_progress};
use crate::state;
use crate::state::SharedState;
#[cfg(test)]
use crate::store::DocumentBlock;
use crate::store::Store;
pub use render_utils::*;
mod file_utils;
pub mod git_utils;
mod path_utils;
mod text_processing;
use self::file_utils::FileUtils;
pub use self::git_utils::GitUtils;
pub use self::path_utils::PathUtils;
use std::fs;
use anyhow::Result;
use ignore;
use std::path::Path;
use std::collections::HashMap;
use std::sync::OnceLock;
/// Namespace type for building directory walkers and ignore matchers that
/// respect `.gitignore` rules and optional `.noindex` files.
pub struct NoindexWalker;
// Process-wide memo of "does this directory tree contain a .noindex file?",
// keyed by directory path, so repeated walker construction avoids re-scanning
// the filesystem. Lazily initialized on first use.
static NOINDEX_CACHE: OnceLock<parking_lot::RwLock<HashMap<std::path::PathBuf, bool>>> =
OnceLock::new();
impl NoindexWalker {
    /// Builds a recursive directory walker rooted at `current_dir` that skips
    /// hidden files, honors local/global git ignore rules, does not follow
    /// symlinks, and — when a `.noindex` file is present — also applies
    /// `.noindex` entries as custom ignore rules.
    pub fn create_walker(current_dir: &Path) -> ignore::WalkBuilder {
        let mut builder = ignore::WalkBuilder::new(current_dir);
        builder
            .hidden(true)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true)
            .follow_links(false);
        // Only register the custom ignore filename when a .noindex file
        // actually exists, so the common case pays no extra cost.
        if Self::has_noindex_files_cached(current_dir) {
            builder.add_custom_ignore_filename(".noindex");
        }
        builder
    }

    /// Cached wrapper around [`Self::has_noindex_files`]; results are
    /// memoized per directory in `NOINDEX_CACHE` for the process lifetime.
    fn has_noindex_files_cached(current_dir: &Path) -> bool {
        let cache = NOINDEX_CACHE.get_or_init(|| parking_lot::RwLock::new(HashMap::new()));
        let current_dir_buf = current_dir.to_path_buf();
        // Fast path: read lock only, released before any filesystem work.
        {
            let cache_read = cache.read();
            if let Some(&cached_result) = cache_read.get(&current_dir_buf) {
                return cached_result;
            }
        }
        let result = Self::has_noindex_files(current_dir);
        {
            let mut cache_write = cache.write();
            cache_write.insert(current_dir_buf, result);
        }
        result
    }

    /// Heuristic scan: checks for a `.noindex` file at the root and in a
    /// fixed list of common project subdirectories (not a full recursive
    /// walk, to keep startup fast).
    fn has_noindex_files(current_dir: &Path) -> bool {
        if current_dir.join(".noindex").exists() {
            return true;
        }
        let common_paths = [
            "src",
            "lib",
            "tests",
            "test",
            "docs",
            "doc",
            "examples",
            "example",
            "target",
            "build",
            "dist",
            "node_modules",
            ".git",
            "vendor",
        ];
        for subdir in &common_paths {
            let noindex_path = current_dir.join(subdir).join(".noindex");
            if noindex_path.exists() {
                return true;
            }
        }
        false
    }

    /// Builds a standalone gitignore-style matcher from `.gitignore` and
    /// `.noindex` at the root of `current_dir`. Load failures are warnings
    /// (printed unless `quiet`), not errors.
    pub fn create_matcher(current_dir: &Path, quiet: bool) -> Result<ignore::gitignore::Gitignore> {
        let mut builder = ignore::gitignore::GitignoreBuilder::new(current_dir);
        let gitignore_path = current_dir.join(".gitignore");
        if gitignore_path.exists() {
            // GitignoreBuilder::add returns Some(error) on failure.
            if let Some(e) = builder.add(&gitignore_path) {
                if !quiet {
                    eprintln!("Warning: Failed to load .gitignore file: {}", e);
                }
            }
        }
        let noindex_path = current_dir.join(".noindex");
        if noindex_path.exists() {
            if let Some(e) = builder.add(&noindex_path) {
                if !quiet {
                    eprintln!("Warning: Failed to load .noindex file for matcher: {}", e);
                }
            }
        }
        Ok(builder.build()?)
    }
}
/// Thin free-function wrappers around [`GitUtils`] so callers can write
/// `git::get_current_commit_hash(...)` instead of going through the
/// utility struct directly.
pub mod git {
    use super::GitUtils;
    use anyhow::Result;
    use std::path::Path;
    /// Returns true when `path` is itself the root of a git repository.
    pub fn is_git_repo_root(path: &Path) -> bool {
        GitUtils::is_git_repo_root(path)
    }
    /// Walks upward from `start_path` looking for the enclosing repo root.
    pub fn find_git_root(start_path: &Path) -> Option<std::path::PathBuf> {
        GitUtils::find_git_root(start_path)
    }
    /// Returns the commit hash currently checked out in `repo_path`.
    pub fn get_current_commit_hash(repo_path: &Path) -> Result<String> {
        GitUtils::get_current_commit_hash(repo_path)
    }
    /// Lists files changed between `since_commit` and the current state.
    pub fn get_changed_files_since_commit(
        repo_path: &Path,
        since_commit: &str,
    ) -> Result<Vec<String>> {
        GitUtils::get_changed_files_since_commit(repo_path, since_commit)
    }
    /// Lists all files with uncommitted or committed changes in the repo.
    pub fn get_all_changed_files(repo_path: &Path) -> Result<Vec<String>> {
        GitUtils::get_all_changed_files(repo_path)
    }
}
/// Returns the file's modification time as seconds (delegates to [`FileUtils`]).
pub fn get_file_mtime(file_path: &std::path::Path) -> Result<u64> {
    FileUtils::get_file_mtime(file_path)
}
/// Maps a file path to a language identifier, or `None` when the file is not
/// a recognized source/document type (delegates to [`FileUtils`]).
pub fn detect_language(path: &std::path::Path) -> Option<&str> {
    FileUtils::detect_language(path)
}
/// Removes index entries for files that no longer exist on disk or that are
/// now excluded by the `.gitignore`/`.noindex` rules.
///
/// Processes the indexed paths in fixed-size chunks and flushes the store
/// after each batch of removals so very large indexes are cleaned up
/// incrementally without accumulating unbounded state.
async fn cleanup_deleted_files_optimized(
    store: &Store,
    current_dir: &std::path::Path,
    quiet: bool,
) -> Result<()> {
    let indexed_files = store.get_all_indexed_file_paths().await?;
    if indexed_files.is_empty() {
        return Ok(());
    }
    let ignore_matcher = NoindexWalker::create_matcher(current_dir, quiet)?;
    let mut files_to_remove = Vec::new();
    let indexed_files_vec: Vec<String> = indexed_files.into_iter().collect();
    const CHUNK_SIZE: usize = 100;
    for chunk in indexed_files_vec.chunks(CHUNK_SIZE) {
        for indexed_file in chunk {
            let absolute_path = current_dir.join(indexed_file);
            if !absolute_path.exists() {
                // File was deleted since it was indexed.
                files_to_remove.push(indexed_file.clone());
            } else {
                // File still exists but may have become ignored since indexing.
                let is_ignored = ignore_matcher
                    .matched(&absolute_path, absolute_path.is_dir())
                    .is_ignore();
                if is_ignored {
                    files_to_remove.push(indexed_file.clone());
                }
            }
        }
        // Flush removals in batches to bound memory usage on large indexes.
        if files_to_remove.len() >= CHUNK_SIZE {
            for file_to_remove in &files_to_remove {
                if let Err(e) = store.remove_blocks_by_path(file_to_remove).await {
                    if !quiet {
                        eprintln!(
                            "Warning: Failed to remove blocks for {}: {}",
                            file_to_remove, e
                        );
                    }
                    tracing::warn!(
                        file = %file_to_remove,
                        error = %e,
                        "Failed to remove blocks during cleanup"
                    );
                }
            }
            files_to_remove.clear();
            store.flush().await?;
        }
    }
    // Handle the final partial batch.
    if !files_to_remove.is_empty() {
        for file_to_remove in &files_to_remove {
            if let Err(e) = store.remove_blocks_by_path(file_to_remove).await {
                if !quiet {
                    eprintln!(
                        "Warning: Failed to remove blocks for {}: {}",
                        file_to_remove, e
                    );
                }
                // Mirror the batched path above so removal failures are
                // always traced, not only printed.
                tracing::warn!(
                    file = %file_to_remove,
                    error = %e,
                    "Failed to remove blocks during cleanup"
                );
            }
        }
        store.flush().await?;
    }
    Ok(())
}
/// Flushes `store` when `force` is set or when `batches_processed` has
/// reached the configured flush frequency, resetting the counter after a
/// flush. Returns `Ok(true)` iff a flush was performed.
async fn flush_if_needed(
    store: &Store,
    batches_processed: &mut usize,
    config: &Config,
    force: bool,
) -> Result<bool> {
    let threshold_reached = *batches_processed >= config.index.flush_frequency;
    if !(force || threshold_reached) {
        return Ok(false);
    }
    store.flush().await?;
    *batches_processed = 0;
    Ok(true)
}
/// Quickly estimates how many files the indexer will visit, for progress
/// reporting.
///
/// When `git_changed_files` is provided, counts only those changed paths
/// that have an extension (no filesystem walk, no existence check — this is
/// an estimate, not an exact count). Otherwise walks the directory with the
/// standard ignore rules and counts files whose extension is on the
/// indexable allow-list below.
fn fast_count_indexable_files(
    current_dir: &Path,
    git_changed_files: Option<&std::collections::HashSet<String>>,
) -> usize {
    let mut count = 0;
    if let Some(changed_files) = git_changed_files {
        // Fast path: approximate by extension presence only.
        for file_path in changed_files {
            let full_path = current_dir.join(file_path);
            if full_path.extension().is_some() {
                count += 1;
            }
        }
        return count;
    }
    let walker = NoindexWalker::create_walker(current_dir).build();
    for result in walker {
        let entry = match result {
            Ok(entry) => entry,
            Err(_) => continue,
        };
        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
            continue;
        }
        if let Some(ext) = entry.path().extension() {
            let ext_str = ext.to_str().unwrap_or("");
            // Allow-list of extensions the indexer treats as code, markup,
            // config, or documentation.
            if matches!(
                ext_str,
                "rs" | "js"
                    | "ts" | "jsx" | "tsx"
                    | "py" | "go" | "java"
                    | "c" | "cpp" | "h"
                    | "hpp" | "cs" | "php"
                    | "rb" | "swift"
                    | "kt" | "scala"
                    | "r" | "m" | "mm"
                    | "md" | "markdown"
                    | "txt" | "json"
                    | "yaml" | "yml"
                    | "toml" | "xml"
                    | "html" | "css"
                    | "scss" | "sass"
                    | "less" | "sql"
                    | "sh" | "bash" | "zsh"
                    | "fish" | "vim"
                    | "lua" | "pl" | "pm"
                    | "t" | "pod" | "raku"
                    | "rakumod" | "rakudoc"
                    | "nix" | "dhall"
                    | "tf" | "tfvars"
                    | "hcl" | "vue" | "svelte"
                    | "elm" | "purs"
                    | "hs" | "lhs" | "ml"
                    | "mli" | "fs" | "fsi"
                    | "fsx" | "clj" | "cljs"
                    | "cljc" | "edn"
                    | "ex" | "exs" | "erl"
                    | "hrl" | "zig" | "v"
                    | "vsh" | "nim" | "nims"
                    | "cr" | "jl" | "d"
                    | "dart" | "pas"
                    | "pp" | "inc" | "asm"
                    | "s" | "S" | "rst"
                    | "adoc" | "tex"
                    | "bib" | "org" | "wiki"
                    | "pod6" | "rakutest"
                    | "cfg" | "conf"
                    | "config" | "ini"
                    | "env" | "properties"
                    | "gradle" | "cmake"
                    | "make" | "makefile"
                    | "dockerfile" | "containerfile"
                    | "vagrantfile" | "gemfile"
                    | "rakefile" | "guardfile"
                    | "podfile" | "fastfile"
                    | "brewfile"
            ) {
                count += 1;
            }
        }
    }
    count
}
/// Flushes the store, then records the current git commit hash (and, when
/// GraphRAG is enabled, the GraphRAG commit hash) as index metadata.
///
/// Only a flush failure is propagated as an error: once the data is
/// persisted, metadata-storage failures are logged (and printed unless
/// `quiet`) but do not abort. `context` tags the log entries with the
/// calling phase.
async fn persist_and_store_metadata(
    store: &Store,
    git_repo_root: Option<&Path>,
    config: &Config,
    quiet: bool,
    context: &str,
) -> Result<()> {
    if let Err(e) = store.flush().await {
        tracing::error!(
            context = context,
            error = %e,
            "Failed to flush store - metadata will NOT be stored"
        );
        return Err(e);
    }
    tracing::debug!(context = context, "Successfully flushed store");
    // Without a git repo there is no commit metadata to record.
    let Some(git_root) = git_repo_root else {
        return Ok(());
    };
    let current_commit = match git::get_current_commit_hash(git_root) {
        Ok(hash) => hash,
        Err(e) => {
            tracing::warn!(
                context = context,
                error = %e,
                "Could not get current commit hash, skipping metadata storage"
            );
            return Ok(());
        }
    };
    if let Err(e) = store.store_git_metadata(&current_commit).await {
        tracing::error!(
            context = context,
            commit = %current_commit,
            error = %e,
            "Failed to store git metadata"
        );
        if !quiet {
            eprintln!("Warning: Could not store git metadata: {}", e);
        }
    } else {
        tracing::debug!(
            context = context,
            commit = %current_commit,
            "Successfully stored git metadata"
        );
    }
    if config.graphrag.enabled {
        if let Err(e) = store.store_graphrag_commit_hash(&current_commit).await {
            tracing::error!(
                context = context,
                commit = %current_commit,
                error = %e,
                "Failed to store GraphRAG git metadata"
            );
            if !quiet {
                eprintln!("Warning: Could not store GraphRAG git metadata: {}", e);
            }
        } else {
            tracing::debug!(
                context = context,
                commit = %current_commit,
                "Successfully stored GraphRAG metadata"
            );
        }
    }
    Ok(())
}
/// Indexes all files under the state's current directory with console output
/// enabled. Convenience wrapper around [`index_files_with_quiet`] with
/// `quiet = false`.
pub async fn index_files(
    store: &Store,
    state: SharedState,
    config: &Config,
    git_repo_root: Option<&Path>,
) -> Result<()> {
    index_files_with_quiet(store, state, config, git_repo_root, false).await
}
pub async fn index_files_with_quiet(
store: &Store,
state: SharedState,
config: &Config,
git_repo_root: Option<&Path>,
quiet: bool,
) -> Result<()> {
let current_dir = state.read().current_directory.clone();
let mut code_blocks_batch = Vec::new();
let mut text_blocks_batch = Vec::new();
let mut document_blocks_batch = Vec::new();
let mut all_code_blocks = Vec::new();
let mut embedding_calls = 0;
let mut batches_processed = 0;
log_indexing_progress(
"indexing_start",
0,
0,
Some(¤t_dir.display().to_string()),
0,
);
{
let mut state_guard = state.write();
state_guard.graphrag_enabled = config.graphrag.enabled;
state_guard.graphrag_blocks = 0;
state_guard.counting_files = true;
state_guard.status_message = "Counting files...".to_string();
state_guard.quiet_mode = quiet;
}
let force_reindex = state.read().force_reindex;
let git_changed_files = if let Some(git_root) = git_repo_root {
if !force_reindex {
if let Ok(Some(last_commit)) = store.get_last_commit_hash().await {
if let Ok(current_commit) = git::get_current_commit_hash(git_root) {
if last_commit != current_commit {
match git::get_changed_files_since_commit(git_root, &last_commit) {
Ok(changed_files) => {
if !quiet {
println!(
"🚀 Git optimization: Commit changed, found {} files to reindex",
changed_files.len()
);
}
for file_path in &changed_files {
if let Err(e) = store.remove_blocks_by_path(file_path).await {
if !quiet {
eprintln!(
"Warning: Failed to clean up data for {}: {}",
file_path, e
);
}
}
}
if !changed_files.is_empty() {
store.flush().await?;
tracing::debug!(
files_cleaned = changed_files.len(),
"Flushed after cleanup of changed files"
);
}
Some(
changed_files
.into_iter()
.collect::<std::collections::HashSet<_>>(),
)
}
Err(e) => {
if !quiet {
eprintln!(
"Warning: Could not get git changes, indexing all files: {}",
e
);
}
None
}
}
} else {
if !quiet {
println!("✅ No commit changes since last index, skipping reindex");
}
if config.graphrag.enabled {
let needs_graphrag_from_existing =
match store.graphrag_needs_indexing().await {
Ok(v) => v,
Err(e) => {
tracing::warn!(
error = %e,
"Failed to check if GraphRAG needs indexing, assuming false"
);
false
}
};
if needs_graphrag_from_existing {
if !quiet {
println!("🔗 Building GraphRAG from existing database...");
}
log_indexing_progress("graphrag_build", 0, 0, None, 0);
let graph_builder =
graphrag::GraphBuilder::new_with_quiet(config.clone(), quiet)
.await?;
graph_builder
.build_from_existing_database(Some(state.clone()))
.await?;
persist_and_store_metadata(
store,
Some(git_root),
config,
quiet,
"graphrag_from_existing",
)
.await?;
}
}
{
let mut state_guard = state.write();
state_guard.indexing_complete = true;
}
return Ok(());
}
} else {
if !quiet {
println!("⚠️ Could not get current commit hash, indexing all files");
}
None
}
} else {
if !quiet {
println!("📋 First-time git indexing: indexing all files");
}
None
}
} else {
None
}
} else {
None
};
let should_cleanup_deleted_files = {
let force_reindex = state.read().force_reindex;
!force_reindex };
if should_cleanup_deleted_files {
{
let mut state_guard = state.write();
state_guard.status_message = "Checking for deleted files...".to_string();
}
log_indexing_progress("cleanup", 0, 0, None, 0);
if let Err(e) = cleanup_deleted_files_optimized(store, ¤t_dir, quiet).await {
if !quiet {
eprintln!("Warning: Cleanup failed: {}", e);
}
}
{
let mut state_guard = state.write();
state_guard.status_message = "".to_string();
}
}
{
let mut state_guard = state.write();
state_guard.status_message = "Loading file metadata...".to_string();
}
let file_metadata_map = store.get_all_file_metadata().await?;
if !quiet {
println!(
"📊 Loaded metadata for {} files from database",
file_metadata_map.len()
);
}
{
let mut state_guard = state.write();
state_guard.total_files = 0; state_guard.counting_files = true;
state_guard.status_message = "Starting indexing...".to_string();
}
let total_files_found;
let mut files_processed = 0;
log_indexing_progress("file_processing", 0, 0, None, 0);
if let Some(ref changed_files) = git_changed_files {
total_files_found = fast_count_indexable_files(¤t_dir, Some(changed_files));
{
let mut state_guard = state.write();
state_guard.total_files = total_files_found;
state_guard.counting_files = false;
state_guard.status_message = "".to_string();
}
for file_path in changed_files {
let full_path = current_dir.join(file_path);
if !full_path.is_file() {
continue;
}
let force_reindex = state.read().force_reindex;
if !force_reindex {
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
if let Some(stored_mtime) = file_metadata_map.get(file_path) {
if actual_mtime <= *stored_mtime {
{
let mut state_guard = state.write();
state_guard.skipped_files += 1;
}
continue;
}
}
}
}
if let Some(language) = detect_language(&full_path) {
match fs::read_to_string(&full_path) {
Ok(contents) => {
let file_processed;
if language == "markdown" {
process_markdown_file_differential(
store,
&contents,
file_path,
&mut document_blocks_batch,
config,
state.clone(),
)
.await?;
file_processed = true;
} else {
let ctx = ProcessFileContext {
store,
config,
state: state.clone(),
};
process_file_differential(
&ctx,
&contents,
file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch, &mut all_code_blocks,
)
.await?;
file_processed = true;
}
if file_processed {
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
let _ = store.store_file_metadata(file_path, actual_mtime).await;
}
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(file_path),
embedding_calls,
);
}
if should_process_batch(&code_blocks_batch, |b| &b.content, config) {
embedding_calls += code_blocks_batch.len();
process_code_blocks_batch(store, &code_blocks_batch, config).await?;
code_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
process_text_blocks_batch(store, &text_blocks_batch, config).await?;
text_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
if should_process_batch(&document_blocks_batch, |b| &b.content, config) {
embedding_calls += document_blocks_batch.len();
process_document_blocks_batch(store, &document_blocks_batch, config)
.await?;
document_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
}
Err(e) => {
log_file_processing_error(file_path, "read_file", &e);
}
}
} else {
if is_allowed_text_extension(&full_path) && !is_markdown_file(&full_path) {
if let Ok(contents) = fs::read_to_string(&full_path) {
if is_text_file(&contents) {
process_text_file_differential(
store,
&contents,
file_path,
&mut text_blocks_batch,
config,
state.clone(),
)
.await?;
if let Ok(actual_mtime) = get_file_mtime(&full_path) {
let _ = store.store_file_metadata(file_path, actual_mtime).await;
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(file_path),
embedding_calls,
);
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
process_text_blocks_batch(store, &text_blocks_batch, config)
.await?;
text_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
}
}
}
}
}
} else {
total_files_found = fast_count_indexable_files(¤t_dir, None);
{
let mut state_guard = state.write();
state_guard.total_files = total_files_found;
state_guard.counting_files = false;
state_guard.status_message = "".to_string();
}
let walker = NoindexWalker::create_walker(¤t_dir).build();
for result in walker {
let entry = match result {
Ok(entry) => entry,
Err(_) => continue,
};
if !entry.file_type().is_some_and(|ft| ft.is_file()) {
continue;
}
let file_path = path_utils::PathUtils::to_relative_string(entry.path(), ¤t_dir);
let force_reindex = state.read().force_reindex;
if !force_reindex {
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
if let Some(stored_mtime) = file_metadata_map.get(&file_path) {
if actual_mtime <= *stored_mtime {
{
let mut state_guard = state.write();
state_guard.skipped_files += 1;
}
continue;
}
}
}
}
if let Some(language) = detect_language(entry.path()) {
match fs::read_to_string(entry.path()) {
Ok(contents) => {
let file_processed;
if language == "markdown" {
process_markdown_file_differential(
store,
&contents,
&file_path,
&mut document_blocks_batch,
config,
state.clone(),
)
.await?;
file_processed = true;
} else {
let ctx = ProcessFileContext {
store,
config,
state: state.clone(),
};
process_file_differential(
&ctx,
&contents,
&file_path,
language,
&mut code_blocks_batch,
&mut text_blocks_batch, &mut all_code_blocks,
)
.await?;
file_processed = true;
}
if file_processed {
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
let _ = store.store_file_metadata(&file_path, actual_mtime).await;
}
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(&file_path),
embedding_calls,
);
}
if should_process_batch(&code_blocks_batch, |b| &b.content, config) {
embedding_calls += code_blocks_batch.len();
process_code_blocks_batch(store, &code_blocks_batch, config).await?;
code_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
process_text_blocks_batch(store, &text_blocks_batch, config).await?;
text_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
if should_process_batch(&document_blocks_batch, |b| &b.content, config) {
embedding_calls += document_blocks_batch.len();
process_document_blocks_batch(store, &document_blocks_batch, config)
.await?;
document_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false).await?;
}
}
Err(e) => {
log_file_processing_error(&file_path, "read_file", &e);
}
}
} else {
if is_allowed_text_extension(entry.path()) && !is_markdown_file(entry.path()) {
if let Ok(contents) = fs::read_to_string(entry.path()) {
if is_text_file(&contents) {
process_text_file_differential(
store,
&contents,
&file_path,
&mut text_blocks_batch,
config,
state.clone(),
)
.await?;
if let Ok(actual_mtime) = get_file_mtime(entry.path()) {
let _ = store.store_file_metadata(&file_path, actual_mtime).await;
}
files_processed += 1;
state.write().indexed_files = files_processed;
if files_processed % 50 == 0 {
let current_total = state.read().total_files;
log_indexing_progress(
"file_processing",
files_processed,
current_total,
Some(&file_path),
embedding_calls,
);
}
if should_process_batch(&text_blocks_batch, |b| &b.content, config) {
embedding_calls += text_blocks_batch.len();
process_text_blocks_batch(store, &text_blocks_batch, config)
.await?;
text_blocks_batch.clear();
batches_processed += 1;
flush_if_needed(store, &mut batches_processed, config, false)
.await?;
}
}
}
}
}
}
}
if !code_blocks_batch.is_empty() {
process_code_blocks_batch(store, &code_blocks_batch, config).await?;
embedding_calls += code_blocks_batch.len();
batches_processed += 1;
}
if !text_blocks_batch.is_empty() {
process_text_blocks_batch(store, &text_blocks_batch, config).await?;
embedding_calls += text_blocks_batch.len();
batches_processed += 1;
}
if !document_blocks_batch.is_empty() {
process_document_blocks_batch(store, &document_blocks_batch, config).await?;
embedding_calls += document_blocks_batch.len();
batches_processed += 1;
}
flush_if_needed(store, &mut batches_processed, config, true).await?;
if config.graphrag.enabled {
let needs_graphrag_from_existing = if all_code_blocks.is_empty() {
let existing_blocks = store
.get_all_code_blocks_for_graphrag()
.await
.unwrap_or_default();
let needs_indexing = match store.graphrag_needs_indexing().await {
Ok(v) => v,
Err(e) => {
tracing::warn!(
error = %e,
"Failed to check if GraphRAG needs indexing, assuming false"
);
false
}
};
!existing_blocks.is_empty() && needs_indexing
} else {
false };
if !all_code_blocks.is_empty() || needs_graphrag_from_existing {
{
let mut state_guard = state.write();
if needs_graphrag_from_existing {
state_guard.status_message =
"Building GraphRAG from existing database...".to_string();
} else {
state_guard.status_message = "Building GraphRAG knowledge graph...".to_string();
}
}
log_indexing_progress(
"graphrag_build",
state.read().indexed_files,
state.read().total_files,
None,
embedding_calls,
);
let graph_builder =
graphrag::GraphBuilder::new_with_quiet(config.clone(), quiet).await?;
if needs_graphrag_from_existing {
graph_builder
.build_from_existing_database(Some(state.clone()))
.await?;
} else {
graph_builder
.process_code_blocks(&all_code_blocks, Some(state.clone()))
.await?;
}
{
let mut state_guard = state.write();
state_guard.status_message = "".to_string();
}
}
}
{
let mut state_guard = state.write();
state_guard.indexing_complete = true;
state_guard.embedding_calls = embedding_calls;
}
let final_files = state.read().indexed_files;
let final_total = state.read().total_files;
log_indexing_progress(
"indexing_complete",
final_files,
final_total,
None,
embedding_calls,
);
let should_store_metadata = final_files > 0
|| git_changed_files.is_some()
|| (total_files_found == 0 && git_changed_files.is_none());
if should_store_metadata {
persist_and_store_metadata(store, git_repo_root, config, quiet, "indexing_complete")
.await?;
} else if !quiet {
println!("⚠️ Git metadata not stored - no files processed and no git optimization used");
}
Ok(())
}
/// Reindexes a single file in response to a filesystem change event.
///
/// Existing blocks for the path are always removed first; if the file still
/// exists and is not excluded by ignore rules, it is reprocessed according
/// to its detected language (markdown, code, or plain text) and the store is
/// flushed. Runs with a fresh, quiet shared state.
pub async fn handle_file_change(store: &Store, file_path: &str, config: &Config) -> Result<()> {
    let state = state::create_shared_state();
    {
        let mut state_guard = state.write();
        state_guard.graphrag_enabled = config.graphrag.enabled;
        state_guard.graphrag_blocks = 0;
    }
    // Remove stale data unconditionally; deletion events end here.
    store.remove_blocks_by_path(file_path).await?;
    let path = std::path::Path::new(file_path);
    if path.exists() {
        let current_dir = std::env::current_dir()?;
        let absolute_path = if path.is_absolute() {
            path.to_path_buf()
        } else {
            current_dir.join(path)
        };
        // Skip files excluded by .gitignore/.noindex rules.
        if let Ok(matcher) = NoindexWalker::create_matcher(&current_dir, true) {
            if matcher
                .matched(&absolute_path, absolute_path.is_dir())
                .is_ignore()
            {
                return Ok(());
            }
        }
        if let Some(language) = detect_language(&absolute_path) {
            if let Ok(contents) = fs::read_to_string(&absolute_path) {
                let relative_file_path =
                    path_utils::PathUtils::to_relative_string(&absolute_path, &current_dir);
                if language == "markdown" {
                    let mut document_blocks_batch = Vec::new();
                    process_markdown_file(
                        store,
                        &contents,
                        &relative_file_path,
                        &mut document_blocks_batch,
                        config,
                        state.clone(),
                    )
                    .await?;
                    if !document_blocks_batch.is_empty() {
                        process_document_blocks_batch(store, &document_blocks_batch, config)
                            .await?;
                    }
                } else {
                    let mut code_blocks_batch = Vec::new();
                    let mut text_blocks_batch = Vec::new();
                    let mut all_code_blocks = Vec::new();
                    let ctx = ProcessFileContext {
                        store,
                        config,
                        state: state.clone(),
                    };
                    process_file(
                        &ctx,
                        &contents,
                        &relative_file_path,
                        language,
                        &mut code_blocks_batch,
                        &mut text_blocks_batch,
                        &mut all_code_blocks,
                    )
                    .await?;
                    if !code_blocks_batch.is_empty() {
                        process_code_blocks_batch(store, &code_blocks_batch, config).await?;
                    }
                    // Keep the knowledge graph in sync with the edited file.
                    if config.graphrag.enabled && !all_code_blocks.is_empty() {
                        let graph_builder = graphrag::GraphBuilder::new(config.clone()).await?;
                        graph_builder
                            .process_code_blocks(&all_code_blocks, Some(state.clone()))
                            .await?;
                    }
                }
                store.flush().await?;
            }
        } else {
            // No recognized language: index as plain text when allowed.
            if is_allowed_text_extension(&absolute_path) {
                if let Ok(contents) = fs::read_to_string(&absolute_path) {
                    if is_text_file(&contents) {
                        let relative_file_path =
                            path_utils::PathUtils::to_relative_string(&absolute_path, &current_dir);
                        let mut text_blocks_batch = Vec::new();
                        process_text_file(
                            store,
                            &contents,
                            &relative_file_path,
                            &mut text_blocks_batch,
                            config,
                            state.clone(),
                        )
                        .await?;
                        if !text_blocks_batch.is_empty() {
                            process_text_blocks_batch(store, &text_blocks_batch, config).await?;
                        }
                        store.flush().await?;
                    }
                }
            }
        }
    }
    Ok(())
}
#[cfg(test)]
mod context_optimization_tests {
    use super::*;
    // Verifies that joining hierarchical context headers with the block
    // content yields embedding text containing all levels, and that storing
    // context separately is smaller than duplicating it into the content.
    #[test]
    fn test_context_optimization() {
        let doc_block = DocumentBlock {
            path: "test.md".to_string(),
            title: "Test Section".to_string(),
            content: "This is the actual content.".to_string(),
            context: vec![
                "# Main Document".to_string(),
                "## Authentication".to_string(),
                "### JWT Implementation".to_string(),
            ],
            level: 3,
            start_line: 10,
            end_line: 15,
            hash: "test_hash".to_string(),
            distance: None,
        };
        // Mirrors the embedding-text construction used at indexing time.
        let embedding_text = if !doc_block.context.is_empty() {
            format!("{}\n\n{}", doc_block.context.join("\n"), doc_block.content)
        } else {
            doc_block.content.clone()
        };
        assert!(embedding_text.contains("# Main Document"));
        assert!(embedding_text.contains("## Authentication"));
        assert!(embedding_text.contains("### JWT Implementation"));
        assert!(embedding_text.contains("This is the actual content."));
        // Compare storage footprint: content + context vs. the old approach
        // of storing both the joined embedding text and the content.
        let storage_size = doc_block.content.len();
        let context_size: usize = doc_block.context.iter().map(|s| s.len()).sum();
        let total_size = storage_size + context_size;
        let old_approach_size = embedding_text.len() + doc_block.content.len();
        assert!(total_size < old_approach_size);
        println!("New approach size: {} bytes", total_size);
        println!("Old approach size: {} bytes", old_approach_size);
        println!(
            "Memory savings: {}%",
            ((old_approach_size - total_size) as f64 / old_approach_size as f64 * 100.0) as i32
        );
    }
    // With no context headers, the embedding text must equal the raw content.
    #[test]
    fn test_empty_context() {
        let doc_block = DocumentBlock {
            path: "test.md".to_string(),
            title: "Test Section".to_string(),
            content: "Content without context.".to_string(),
            context: Vec::new(),
            level: 1,
            start_line: 0,
            end_line: 5,
            hash: "test_hash".to_string(),
            distance: None,
        };
        let embedding_text = if !doc_block.context.is_empty() {
            format!("{}\n\n{}", doc_block.context.join("\n"), doc_block.content)
        } else {
            doc_block.content.clone()
        };
        assert_eq!(embedding_text, doc_block.content);
    }
    // Checks that bottom-up chunking merges tiny sections so at most one
    // sub-100-char chunk survives.
    #[test]
    fn test_smart_chunking_eliminates_tiny_chunks() {
        // NOTE: the raw string content is significant — keep it verbatim.
        let test_content = r#"# Main Document
## Section A
Some content here.
### Tiny Subsection
Only 33 symbols here - very small!
### Another Tiny
Also small content.
## Section B
This has more substantial content that should be fine on its own.
It has multiple lines and provides good context for understanding.
### Small Child
Brief content.
"#;
        let hierarchy = parse_document_hierarchy(test_content);
        let chunks = hierarchy.bottom_up_chunking(2000);
        let tiny_chunks: Vec<&ChunkResult> = chunks
            .iter()
            .filter(|chunk| chunk.storage_content.len() < 100)
            .collect();
        println!("Generated {} chunks total", chunks.len());
        for (i, chunk) in chunks.iter().enumerate() {
            println!(
                "Chunk {}: {} chars - '{}'",
                i + 1,
                chunk.storage_content.len(),
                chunk.title
            );
        }
        if !tiny_chunks.is_empty() {
            println!("Found {} tiny chunks:", tiny_chunks.len());
            for chunk in &tiny_chunks {
                println!("- '{}': {} chars", chunk.title, chunk.storage_content.len());
            }
        }
        assert!(
            tiny_chunks.len() <= 1,
            "Should have at most 1 tiny chunk after smart merging"
        );
    }
}