Skip to main content

codesearch/index/
mod.rs

1use anyhow::Result;
2use colored::Colorize;
3use indicatif::{ProgressBar, ProgressStyle};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::time::Instant;
7use tokio_util::sync::CancellationToken;
8use tracing::{debug, info};
9
10use crate::cache::{normalize_path, FileMetaStore};
11use crate::chunker::SemanticChunker;
12use crate::db_discovery::{find_best_database, register_repository, unregister_repository};
13use crate::embed::{EmbeddingService, ModelType};
14use crate::file::FileWalker;
15use crate::fts::FtsStore;
16use crate::vectordb::VectorStore;
17
18// Index manager module
19mod manager;
20pub use manager::{IndexManager, SharedStores};
21
22/// Get the database path and project path for a given directory
23/// Uses automatic database discovery to find indexes in parent/global directories
24fn get_db_path(path: Option<PathBuf>) -> Result<(PathBuf, PathBuf)> {
25    use crate::db_discovery::resolve_database_with_message;
26    resolve_database_with_message(path.as_deref(), "indexing")
27}
28
29/// Smart database path resolution that handles global/local/force scenarios
30/// Ensures only ONE database per repository (local or global, never both)
31///
32/// # Safety Checks
33/// - Detects git/hg/svn roots to prevent indexing subdirs
34/// - Warns if trying to create a db in a non-root directory
35fn get_db_path_smart(
36    path: Option<PathBuf>,
37    global: bool,
38    force: bool,
39) -> Result<(PathBuf, PathBuf)> {
40    let target = path.as_deref();
41    let project_path = path.as_deref().unwrap_or(Path::new("."));
42
43    // Try to canonicalize, but fall back to original path if it fails
44    // Then normalize: strip UNC prefix (\\?\) and use forward slashes for consistency
45    let canonical_path = PathBuf::from(normalize_path(
46        &project_path
47            .canonicalize()
48            .unwrap_or_else(|_| PathBuf::from(project_path)),
49    ));
50
51    // Step 1: Check if there's an existing database (local or global)
52    let existing_db = find_best_database(target)?;
53
54    // Step 2: Handle --force flag
55    if force {
56        if let Some(ref db_info) = existing_db {
57            // Delete existing database (local or global)
58            println!(
59                "{}",
60                format!(
61                    "šŸ—‘ļø  Force rebuild: deleting existing database at {}",
62                    db_info.db_path.display()
63                )
64                .yellow()
65            );
66            std::fs::remove_dir_all(&db_info.db_path)?;
67            // Wait for Windows to fully release file handles (memory-mapped files
68            // from LMDB/tantivy may not be immediately released after deletion)
69            // Increased to 1000ms to handle slow file handle release on Windows
70            std::thread::sleep(std::time::Duration::from_millis(1000));
71            println!("āœ… Existing database deleted");
72        }
73        // After deletion, continue to create new database
74    }
75
76    // Step 3: Handle --global flag
77    if global {
78        // User explicitly wants global database
79        if let Some(ref db_info) = existing_db {
80            if !force && db_info.is_global {
81                // Global database already exists, use it
82                println!(
83                    "{}",
84                    format!(
85                        "šŸŒ Using existing global database: {}",
86                        db_info.db_path.display()
87                    )
88                    .dimmed()
89                );
90                return Ok((db_info.db_path.clone(), db_info.project_path.clone()));
91            } else if !force && !db_info.is_global {
92                // Local database exists but user wants global
93                println!(
94                    "{}",
95                    format!(
96                        "āš ļø  Local database exists at {}\n   Moving to global database...",
97                        db_info.db_path.display()
98                    )
99                    .yellow()
100                );
101                // Delete local database
102                std::fs::remove_dir_all(&db_info.db_path)?;
103                println!("āœ… Local database removed");
104            }
105        }
106        // Create or use global database
107        return get_global_db_path(path);
108    }
109
110    // Step 4: Use automatic discovery (default behavior)
111    if let Some(ref db_info) = existing_db {
112        // Use existing database (local or global)
113        if !db_info.is_current {
114            let current_dir = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
115            let relative_path = if let Ok(rel) = current_dir.strip_prefix(&db_info.project_path) {
116                format!("./{}", rel.display())
117            } else {
118                db_info.project_path.display().to_string()
119            };
120            println!(
121                "{}",
122                format!(
123                    "šŸ“‚ Using database from: {}\n   (indexing from subfolder, project root: {})",
124                    db_info.db_path.display(),
125                    relative_path
126                )
127                .dimmed()
128            );
129        }
130        return Ok((db_info.db_path.clone(), db_info.project_path.clone()));
131    }
132
133    // Step 5: No existing database - SAFETY CHECK before creating
134    // Detect if we're in a subdirectory of a project (git/hg/svn root detection)
135    let project_root = find_project_root(&canonical_path);
136
137    if let Some(root) = project_root {
138        if root != canonical_path {
139            // We're in a subdirectory of a project!
140            println!(
141                "{}",
142                format!(
143                    "āš ļø  You are in a subdirectory: {}\n   Project root detected at: {}",
144                    canonical_path.display(),
145                    root.display()
146                )
147                .yellow()
148            );
149            println!(
150                "{}",
151                "   Creating database at project root to avoid duplicate indexes.".yellow()
152            );
153            let db_path = root.join(".codesearch.db");
154            return Ok((db_path, root));
155        }
156    } else {
157        // No project markers found - warn the user
158        println!(
159            "{}",
160            format!(
161                "ā„¹ļø  No project root detected (no .git, Cargo.toml, package.json, etc.)\n   Creating database in: {}",
162                canonical_path.display()
163            ).dimmed()
164        );
165        println!(
166            "{}",
167            "   Tip: If this is a subdirectory, run 'codesearch index' from the project root."
168                .dimmed()
169        );
170    }
171
172    // Step 6: Create local database in current directory
173    let db_path = canonical_path.join(".codesearch.db");
174    Ok((db_path, canonical_path))
175}
176
/// Find the project root by walking up from `start_path`.
///
/// Starting at `start_path` itself and moving to each parent in turn, returns
/// the first directory that contains any of the following:
/// `.git`, `.hg`, `.svn`, `Cargo.toml`, `package.json`, `pyproject.toml`,
/// `go.mod`, or an entry with the `.sln` extension.
///
/// The nearest matching directory wins; the markers have no priority among
/// themselves. Returns `None` when no ancestor contains a marker.
fn find_project_root(start_path: &Path) -> Option<PathBuf> {
    // Exact entry names whose presence marks a project root. Solution files are
    // not listed here because they are named `<something>.sln`; they are
    // detected by extension below.
    const MARKERS: [&str; 7] = [
        ".git",           // Git repository
        ".hg",            // Mercurial repository
        ".svn",           // Subversion repository
        "Cargo.toml",     // Rust project
        "package.json",   // Node.js project
        "pyproject.toml", // Python project
        "go.mod",         // Go project
    ];

    // True when `dir` can be listed and contains an entry with the `sln` extension.
    // Unreadable directories simply count as "no solution file".
    fn has_solution_file(dir: &Path) -> bool {
        std::fs::read_dir(dir)
            .map(|entries| {
                entries
                    .flatten()
                    .any(|entry| entry.path().extension().map_or(false, |ext| ext == "sln"))
            })
            .unwrap_or(false)
    }

    start_path
        .ancestors()
        .find(|dir| {
            MARKERS.iter().any(|marker| dir.join(marker).exists()) || has_solution_file(dir)
        })
        .map(Path::to_path_buf)
}
224
225/// Get the global database path for a given directory
226/// Uses ~/.codesearch.dbs/<project_name>/ for storage
227fn get_global_db_path(path: Option<PathBuf>) -> Result<(PathBuf, PathBuf)> {
228    use dirs::home_dir;
229
230    let project_path = path.unwrap_or_else(|| PathBuf::from("."));
231    let canonical_path = project_path.canonicalize()?;
232
233    // Create a unique name for the project based on its path
234    // Use the directory name as the project identifier
235    let project_name = canonical_path
236        .file_name()
237        .and_then(|n| n.to_str())
238        .unwrap_or("unknown");
239
240    // Create global database directory
241    let home = home_dir().ok_or_else(|| anyhow::anyhow!("No home directory found"))?;
242    let global_db_dir = home.join(".codesearch.dbs").join(project_name);
243    let db_path = global_db_dir.join(".codesearch.db");
244
245    // Register this repository in the global tracking
246    register_repository(&canonical_path)?;
247
248    println!(
249        "{}",
250        format!(
251            "šŸŒ Using global database: {}\n   (project: {})",
252            db_path.display(),
253            project_name
254        )
255        .dimmed()
256    );
257
258    Ok((db_path, canonical_path))
259}
260
261/// Index a repository
262///
263/// # Arguments
264/// * `path` - Path to index (defaults to current directory)
265/// * `dry_run` - Preview what would be indexed without indexing
266/// * `force` - Delete existing index and rebuild from scratch
267/// * `global` - Create global index instead of local
268/// * `model` - Override embedding model
269/// * `quiet` - Suppress verbose output (for server/MCP mode)
270pub async fn index(
271    path: Option<PathBuf>,
272    dry_run: bool,
273    force: bool,
274    global: bool,
275    model: Option<ModelType>,
276    cancel_token: CancellationToken,
277) -> Result<()> {
278    index_with_options(path, dry_run, force, global, model, false, cancel_token).await
279}
280
281/// Index a repository with quiet mode option (for server/MCP use)
282pub async fn index_quiet(
283    path: Option<PathBuf>,
284    force: bool,
285    cancel_token: CancellationToken,
286) -> Result<()> {
287    index_with_options(path, false, force, false, None, true, cancel_token).await
288}
289
290/// Internal index function with all options
291async fn index_with_options(
292    path: Option<PathBuf>,
293    dry_run: bool,
294    force: bool,
295    global: bool,
296    model: Option<ModelType>,
297    quiet: bool,
298    cancel_token: CancellationToken,
299) -> Result<()> {
300    let (db_path, project_path) = get_db_path_smart(path, global, force)?;
301    let model_type = model.unwrap_or_default();
302
303    // Macro to conditionally print
304    macro_rules! log_print {
305        ($($arg:tt)*) => {
306            if !quiet {
307                println!($($arg)*);
308            }
309        };
310    }
311
312    log_print!("{}", "šŸš€ Codesearch Indexer".bright_cyan().bold());
313    log_print!("{}", "=".repeat(60));
314    log_print!("šŸ“‚ Project: {}", project_path.display());
315    log_print!("šŸ’¾ Database: {}", db_path.display());
316    log_print!(
317        "🧠 Model: {} ({} dims)",
318        model_type.name(),
319        model_type.dimensions()
320    );
321
322    if dry_run {
323        log_print!("\n{}", "šŸ” DRY RUN MODE".bright_yellow());
324    }
325
326    // Phase 1: File Discovery
327    log_print!("\n{}", "Phase 1: File Discovery".bright_cyan());
328    log_print!("{}", "-".repeat(60));
329
330    let start = Instant::now();
331    let walker = FileWalker::new(project_path.clone());
332    let (mut files, stats) = walker.walk()?;
333    let discovery_duration = start.elapsed();
334
335    log_print!(
336        "āœ… Found {} indexable files in {:?}",
337        files.len(),
338        discovery_duration
339    );
340    log_print!("   Total files scanned: {}", stats.total_files);
341    log_print!("   Binary/skipped: {}", stats.skipped_binary);
342    log_print!("   Total size: {:.2} MB", stats.total_size_mb());
343
344    if files.is_empty() {
345        log_print!("\n{}", "No files to index!".yellow());
346        return Ok(());
347    }
348
349    if dry_run {
350        log_print!("\n{}", "Dry run complete!".green());
351        return Ok(());
352    }
353
354    let is_incremental = db_path.exists() && !force;
355
356    // Load FileMetaStore for incremental indexing (will be used later to update metadata)
357    let mut file_meta_store = if is_incremental {
358        log_print!("\n{}", "šŸ“Š Incremental Indexing".bright_cyan());
359        log_print!("{}", "-".repeat(60));
360
361        Some(FileMetaStore::load_or_create(
362            &db_path,
363            model_type.name(),
364            model_type.dimensions(),
365        )?)
366    } else {
367        None
368    };
369
370    if is_incremental {
371        let file_meta_store = file_meta_store.as_mut().unwrap();
372
373        // Find changed and deleted files
374        let mut changed_files = Vec::new();
375        let mut unchanged_files = 0;
376
377        for file in &files {
378            let (needs_reindex, _old_chunk_ids) = file_meta_store.check_file(&file.path)?;
379
380            if needs_reindex {
381                changed_files.push(file.clone());
382                debug!("šŸ“ File changed (needs reindex): {}", file.path.display());
383            } else {
384                unchanged_files += 1;
385                debug!("āœ… File unchanged: {}", file.path.display());
386            }
387        }
388
389        // Find deleted files (in metadata but not on disk)
390        let deleted_files = file_meta_store.find_deleted_files();
391
392        for (file_path, _chunk_ids) in &deleted_files {
393            debug!("šŸ—‘ļø  File deleted from disk: {}", file_path);
394        }
395
396        log_print!("   Unchanged files: {}", unchanged_files);
397        log_print!("   Changed files: {}", changed_files.len());
398        log_print!("   Deleted files: {}", deleted_files.len());
399
400        // If no changes and no deleted files, we're done
401        if changed_files.is_empty() && deleted_files.is_empty() {
402            log_print!("\n{}", "āœ… Database is up to date!".green());
403            return Ok(());
404        }
405
406        // Delete chunks for changed and deleted files
407        let mut total_chunks_to_delete = 0u32;
408        for (_, chunk_ids) in deleted_files.iter() {
409            total_chunks_to_delete += chunk_ids.len() as u32;
410        }
411        for file in &changed_files {
412            let (_, chunk_ids) = file_meta_store.check_file(&file.path)?;
413            total_chunks_to_delete += chunk_ids.len() as u32;
414        }
415
416        if total_chunks_to_delete > 0 {
417            log_print!("\nšŸ”„ Deleting {} old chunks...", total_chunks_to_delete);
418
419            let mut store = VectorStore::new(&db_path, 384)?; // Will load dimensions from DB
420            let mut fts_store = FtsStore::new_with_writer(&db_path)?;
421
422            // Delete deleted files' metadata and chunks
423            for (file_path, chunk_ids) in deleted_files {
424                if !chunk_ids.is_empty() {
425                    info!(
426                        "šŸ—‘ļø  Deleting {} chunks for deleted file: {}",
427                        chunk_ids.len(),
428                        file_path
429                    );
430                    debug!("   File path: {}", file_path);
431                    store.delete_chunks(&chunk_ids)?;
432                    for chunk_id in &chunk_ids {
433                        fts_store.delete_chunk(*chunk_id)?;
434                    }
435                }
436                file_meta_store.remove_file(Path::new(&file_path));
437            }
438
439            // Delete changed files' old chunks
440            for file in &changed_files {
441                let (_, old_chunk_ids) = file_meta_store.check_file(&file.path)?;
442                if !old_chunk_ids.is_empty() {
443                    let file_path_str = file.path.to_string_lossy().to_string();
444                    info!(
445                        "šŸ”„ Deleting {} old chunks for changed file: {}",
446                        old_chunk_ids.len(),
447                        file_path_str
448                    );
449                    debug!("   File path: {}", file.path.display());
450                    store.delete_chunks(&old_chunk_ids)?;
451                    for chunk_id in &old_chunk_ids {
452                        fts_store.delete_chunk(*chunk_id)?;
453                    }
454                }
455            }
456
457            fts_store.commit()?;
458
459            // Rebuild vector index after deletions - critical for ANN search correctness
460            log_print!("šŸ”Ø Rebuilding vector index after deletions...");
461            store.build_index()?;
462
463            log_print!("āœ… Deleted {} chunks", total_chunks_to_delete);
464
465            // Explicitly drop stores to release LMDB memory map before Phase 2
466            drop(store);
467            drop(fts_store);
468        }
469
470        // Only process changed files
471        log_print!("\nšŸ”„ Processing {} changed files...", changed_files.len());
472        files = changed_files;
473    } else {
474        // Note: database deletion for --force is handled in get_db_path_smart()
475        // (including the delay for Windows file handle release). This else branch
476        // only runs when not in incremental mode, i.e., fresh index creation.
477    }
478
479    // Phase 2: Semantic Chunking + Embedding + Storage (Streaming)
480    // We process files one at a time to keep memory usage low
481    log_print!(
482        "\n{}",
483        "Phase 2: Semantic Chunking, Embedding & Storage".bright_cyan()
484    );
485    log_print!("{}", "-".repeat(60));
486
487    let chunking_start = Instant::now();
488    let mut chunker = SemanticChunker::new(100, 2000, 10);
489    let mut total_chunks = 0;
490
491    let pb = ProgressBar::new(files.len() as u64);
492    pb.set_style(
493        ProgressStyle::default_bar()
494            .template("[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} {msg}")
495            .unwrap()
496            .progress_chars("ā–ˆā–“ā–’ā–‘ "),
497    );
498
499    // Initialize embedding model (uses global models cache)
500    let cache_dir = crate::constants::get_global_models_cache_dir()?;
501    let mut embedding_service =
502        EmbeddingService::with_cache_dir(model_type, Some(cache_dir.as_path()))?;
503
504    // Check for shutdown after model loading (can take 5-10 seconds)
505    if crate::constants::check_shutdown(&cancel_token) {
506        log_print!(
507            "\n{}",
508            "āš ļø  Indexing cancelled during model loading".yellow()
509        );
510        return Ok(());
511    }
512
513    // Initialize vector store
514    let mut store = VectorStore::new(&db_path, embedding_service.dimensions())?;
515
516    // Initialize FTS store
517    let mut fts_store = FtsStore::new_with_writer(&db_path)?;
518
519    // Track chunk IDs per file for metadata (memory efficient: only file paths, not chunk contents)
520    let mut file_chunks: std::collections::HashMap<String, Vec<u32>> =
521        std::collections::HashMap::new();
522
523    // Arena reset interval: periodically recreate the ONNX session to free
524    // arena allocator memory that grows monotonically. Model is on disk, so
525    let mut skipped_files = 0;
526    let mut cancelled = false;
527    for file in &files {
528        // Check for cancellation before processing each file
529        // Uses BOTH global AtomicBool (set by ctrlc OS handler) AND CancellationToken (for programmatic cancel)
530        if crate::constants::check_shutdown(&cancel_token) {
531            cancelled = true;
532            break;
533        }
534
535        pb.set_message(format!(
536            "{}",
537            file.path.file_name().unwrap().to_string_lossy()
538        ));
539
540        debug!("šŸ“„ Processing file: {}", file.path.display());
541
542        // Skip files that aren't valid UTF-8
543        let source_code = match std::fs::read_to_string(&file.path) {
544            Ok(content) => content,
545            Err(_) => {
546                debug!("āš ļø  Skipping file (invalid UTF-8): {}", file.path.display());
547                skipped_files += 1;
548                pb.inc(1);
549                continue;
550            }
551        };
552
553        // Phase 2a: Chunk this file only (memory efficient!)
554        let chunks = chunker.chunk_semantic(file.language, &file.path, &source_code)?;
555        let chunk_count = chunks.len();
556        debug!(
557            "   Created {} chunks for {}",
558            chunk_count,
559            file.path.display()
560        );
561
562        if chunks.is_empty() {
563            pb.inc(1);
564            continue;
565        }
566
567        // Phase 2b: Embed chunks for this file only (batched internally)
568        // If embedding is interrupted by CTRL-C, catch it as cancellation (not error)
569        let embedded_chunks = match embedding_service.embed_chunks(chunks) {
570            Ok(chunks) => chunks,
571            Err(_) if crate::constants::is_shutdown_requested() => {
572                cancelled = true;
573                break;
574            }
575            Err(e) => return Err(e),
576        };
577
578        // Check cancellation after embedding (most CPU-intensive step)
579        if crate::constants::check_shutdown(&cancel_token) {
580            cancelled = true;
581            break;
582        }
583
584        // Phase 2c: Extract lightweight FTS data before handing ownership to vector store.
585        // We capture just the strings needed for FTS (content, path, signature, kind)
586        // so we can pass full EmbeddedChunks to the vector store without cloning.
587        let fts_data: Vec<(String, String, Option<String>, String)> = embedded_chunks
588            .iter()
589            .map(|ec| {
590                (
591                    ec.chunk.content.clone(),
592                    ec.chunk.path.clone(),
593                    ec.chunk.signature.clone(),
594                    format!("{:?}", ec.chunk.kind),
595                )
596            })
597            .collect();
598
599        // Phase 2d: Insert into vector store (takes ownership, no clone needed)
600        let chunk_ids = store.insert_chunks_with_ids(embedded_chunks)?;
601
602        // Phase 2e: Insert into FTS with real chunk IDs from vector store.
603        // FTS failures are non-fatal: vector search is the primary search method,
604        // FTS (BM25) is supplementary for hybrid search. If tantivy encounters
605        // I/O errors (common on Windows due to antivirus interference), we log
606        // a warning and continue rather than aborting the entire indexing run.
607        for ((content, path, signature, kind), &chunk_id) in fts_data.iter().zip(chunk_ids.iter()) {
608            if let Err(e) = fts_store.add_chunk(chunk_id, content, path, signature.as_deref(), kind)
609            {
610                tracing::warn!(
611                    "FTS add_chunk failed in {}: {} (continuing without FTS for this chunk)",
612                    file.path.display(),
613                    e
614                );
615            }
616        }
617
618        // Track chunk IDs per file for metadata (only paths and IDs, not chunk content)
619        let file_path = file.path.to_string_lossy().to_string();
620        file_chunks.insert(file_path, chunk_ids.clone());
621
622        total_chunks += chunk_count;
623        pb.inc(1);
624
625        // Periodic FTS commit to flush the in-memory segment to disk in a controlled
626        // way. Non-fatal: if commit fails, we log and continue. Some FTS data may
627        // be lost but vector search (primary) is unaffected.
628        if total_chunks % 1000 == 0 && total_chunks > 0 {
629            if let Err(e) = fts_store.commit() {
630                tracing::warn!(
631                    "Periodic FTS commit failed at {} chunks: {} (continuing, some FTS data may be lost)",
632                    total_chunks,
633                    e
634                );
635            }
636        }
637
638        // Memory is freed here - chunks/embeddings dropped before next file
639    }
640
641    // Handle cancellation: exit quickly without blocking on build_index
642    if cancelled {
643        pb.finish_with_message("Cancelled!");
644        log_print!("\n{}", "āš ļø  Indexing cancelled by user".yellow());
645
646        // Free ONNX model memory immediately
647        drop(embedding_service);
648        drop(chunker);
649
650        // Don't call build_index() — it blocks for 10-30 seconds on large datasets.
651        // The database is in a partially written state, user can re-run with --force.
652        // Commit FTS with retry to avoid index corruption on shutdown.
653        if total_chunks > 0 {
654            if let Err(e) = fts_store.commit() {
655                // Log the error - best-effort commit failed
656                log_print!(
657                    "{}   FTS commit warning: {} (index may need recovery)",
658                    "āš ļø ".yellow(),
659                    e
660                );
661                log_print!(
662                    "{}   Run {} to rebuild the index cleanly if needed",
663                    "šŸ’” ".cyan(),
664                    "codesearch index -f".bright_cyan()
665                );
666            } else {
667                log_print!(
668                    "   Partial progress: {} chunks written (re-run with --force for clean index)",
669                    total_chunks
670                );
671            }
672        }
673
674        return Ok(());
675    }
676
677    // Capture model info before dropping the ONNX model
678    let model_short_name = embedding_service.model_short_name().to_string();
679    let model_name = embedding_service.model_name().to_string();
680    let model_dimensions = embedding_service.dimensions();
681
682    // Free ONNX model + arena allocator memory before final index operations
683    // This releases hundreds of MB of inference buffers
684    drop(embedding_service);
685    drop(chunker);
686
687    // Commit FTS store (non-fatal: vector search works without FTS)
688    if let Err(e) = fts_store.commit() {
689        tracing::warn!(
690            "Final FTS commit failed: {} (vector search will work, but hybrid/BM25 search may have gaps)",
691            e
692        );
693    }
694
695    if skipped_files > 0 {
696        log_print!("   āš ļø  Skipped {} files (invalid UTF-8)", skipped_files);
697    }
698
699    pb.finish_with_message("Done!");
700    let chunking_duration = chunking_start.elapsed();
701
702    log_print!(
703        "āœ… Created and indexed {} chunks in {:?}",
704        total_chunks,
705        chunking_duration
706    );
707
708    if total_chunks == 0 {
709        log_print!("\n{}", "No chunks created!".yellow());
710        return Ok(());
711    }
712
713    // Capture FTS stats before dropping the store to free memory
714    let _fts_stats = fts_store.stats()?;
715
716    // Drop FTS store before build_index() to free tantivy memory.
717    // FTS is already committed above — keeping the store open during
718    // build_index() wastes memory on tantivy's segment readers and buffers.
719    drop(fts_store);
720
721    // Build vector index (now that all chunks are inserted)
722    let storage_start = Instant::now();
723    store.build_index()?;
724    let _storage_duration = storage_start.elapsed();
725
726    // Save model metadata
727    let metadata = serde_json::json!({
728        "model_short_name": model_short_name,
729        "model_name": model_name,
730        "dimensions": model_dimensions,
731        "indexed_at": chrono::Utc::now().to_rfc3339(),
732    });
733    std::fs::write(
734        db_path.join("metadata.json"),
735        serde_json::to_string_pretty(&metadata)?,
736    )?;
737
738    // Update FileMetaStore with new chunk IDs (incremental mode)
739    if is_incremental {
740        // IMPORTANT: Reuse the existing file_meta_store that already contains unchanged files!
741        // Don't create a new one - that would lose all unchanged file metadata
742        let mut file_meta_store = file_meta_store.take().unwrap();
743
744        // Save FileMetaStore count before moving
745        let file_count = file_chunks.len();
746
747        // Update FileMetaStore with new/changed files (unchanged files are already preserved)
748        for (file_path, chunk_ids) in file_chunks {
749            file_meta_store.update_file(Path::new(&file_path), chunk_ids)?;
750        }
751
752        // Save FileMetaStore (includes both unchanged + updated files)
753        file_meta_store.save(&db_path)?;
754
755        log_print!(
756            "āœ… Updated metadata for {} changed files (unchanged files preserved)",
757            file_count
758        );
759    } else {
760        // In full index mode, create a fresh FileMetaStore
761        let mut file_meta_store =
762            FileMetaStore::new(model_type.name().to_string(), model_type.dimensions());
763
764        // Update FileMetaStore
765        for (file_path, chunk_ids) in file_chunks {
766            file_meta_store.update_file(Path::new(&file_path), chunk_ids)?;
767        }
768
769        // Save FileMetaStore
770        file_meta_store.save(&db_path)?;
771    }
772
773    // Show final stats
774    let db_stats = store.stats()?;
775    log_print!("\n{}", "šŸ“Š Final Statistics".bright_green().bold());
776    log_print!("{}", "=".repeat(60));
777    log_print!("   Total chunks: {}", db_stats.total_chunks);
778    log_print!("   Total files: {}", db_stats.total_files);
779    log_print!(
780        "   Indexed: {}",
781        if db_stats.indexed {
782            "āœ… Yes"
783        } else {
784            "āŒ No"
785        }
786    );
787
788    // Calculate database size
789    let mut total_size = 0u64;
790    for entry in std::fs::read_dir(&db_path)? {
791        let entry = entry?;
792        total_size += entry.metadata()?.len();
793    }
794    log_print!(
795        "   Database size: {:.2} MB",
796        total_size as f64 / (1024.0 * 1024.0)
797    );
798
799    log_print!("\n{}", "✨ Indexing complete".bright_green().bold());
800    log_print!(
801        "   Run {} to search your codebase",
802        "codesearch search <query>".bright_cyan()
803    );
804
805    Ok(())
806}
807
808/// List all indexed repositories
809#[allow(dead_code)] // Reserved for 'list' command implementation
810pub async fn list() -> Result<()> {
811    println!("{}", "šŸ“š Indexed Repositories".bright_cyan().bold());
812    println!("{}", "=".repeat(60));
813
814    // TODO: Scan all repositories in ~/.codesearch/repos.json
815    // For now just check current directory
816
817    // Check current directory
818    let current_dir = std::env::current_dir()?;
819    let current_db = current_dir.join(".codesearch.db");
820
821    if current_db.exists() {
822        println!("\n{}", "Current Directory:".bright_green());
823        print_repo_stats(&current_dir, &current_db)?;
824    }
825
826    // TODO: Track indexed repositories globally in ~/.codesearch/repos.json
827    // For now, just show current directory
828
829    Ok(())
830}
831
832/// Show statistics about the vector database
833pub async fn stats(path: Option<PathBuf>) -> Result<()> {
834    let (db_path, project_path) = get_db_path(path)?;
835
836    if !db_path.exists() {
837        println!("{}", "āŒ No database found!".red());
838        println!("   Run {} first", "codesearch index".bright_cyan());
839        return Ok(());
840    }
841
842    println!("{}", "šŸ“Š Database Statistics".bright_cyan().bold());
843    println!("{}", "=".repeat(60));
844    println!("šŸ’¾ Database: {}", db_path.display());
845    println!("šŸ“‚ Project: {}", project_path.display());
846
847    let store = VectorStore::new(&db_path, 384)?; // We'll need to store dimensions in metadata
848    let stats = store.stats()?;
849
850    println!("\n{}", "Vector Store:".bright_green());
851    println!("   Total chunks: {}", stats.total_chunks);
852    println!("   Total files: {}", stats.total_files);
853    println!(
854        "   Indexed: {}",
855        if stats.indexed { "āœ… Yes" } else { "āŒ No" }
856    );
857    println!("   Dimensions: {}", stats.dimensions);
858
859    // Calculate database size
860    let mut total_size = 0u64;
861    for entry in std::fs::read_dir(&db_path)? {
862        let entry = entry?;
863        total_size += entry.metadata()?.len();
864    }
865
866    println!("\n{}", "Storage:".bright_green());
867    println!(
868        "   Database size: {:.2} MB",
869        total_size as f64 / (1024.0 * 1024.0)
870    );
871    println!(
872        "   Avg per chunk: {:.2} KB",
873        (total_size as f64 / stats.total_chunks as f64) / 1024.0
874    );
875
876    Ok(())
877}
878
879/// Clear the vector database
880pub async fn clear(path: Option<PathBuf>, yes: bool) -> Result<()> {
881    let (db_path, project_path) = get_db_path(path)?;
882
883    if !db_path.exists() {
884        println!("{}", "āŒ No database found!".red());
885        return Ok(());
886    }
887
888    println!("{}", "šŸ—‘ļø  Clear Database".bright_yellow().bold());
889    println!("{}", "=".repeat(60));
890    println!("šŸ’¾ Database: {}", db_path.display());
891    println!("šŸ“‚ Project: {}", project_path.display());
892
893    if !yes {
894        println!("\n{}", "āš ļø  This will delete all indexed data!".yellow());
895        print!("Are you sure? (y/N): ");
896        use std::io::{self, Write};
897        io::stdout().flush()?;
898
899        let mut input = String::new();
900        io::stdin().read_line(&mut input)?;
901
902        if !input.trim().eq_ignore_ascii_case("y") {
903            println!("{}", "Cancelled.".dimmed());
904            return Ok(());
905        }
906    }
907
908    println!("\nšŸ”„ Removing database...");
909    std::fs::remove_dir_all(&db_path)?;
910
911    println!("{}", "āœ… Database cleared!".green());
912
913    Ok(())
914}
915
916/// Helper to print repository stats
917#[allow(dead_code)] // Used by list() function
918fn print_repo_stats(repo_path: &Path, db_path: &Path) -> Result<()> {
919    println!("   šŸ“‚ {}", repo_path.display());
920
921    // Try to load stats
922    match VectorStore::new(db_path, 384) {
923        Ok(store) => match store.stats() {
924            Ok(stats) => {
925                println!(
926                    "      {} chunks in {} files",
927                    stats.total_chunks, stats.total_files
928                );
929            }
930            Err(_) => {
931                println!("      {}", "Could not load stats".dimmed());
932            }
933        },
934        Err(_) => {
935            println!("      {}", "Could not open database".dimmed());
936        }
937    }
938
939    Ok(())
940}
941
942/// Add a repository to the index (creates local or global)
943pub async fn add_to_index(
944    path: Option<PathBuf>,
945    global: bool,
946    cancel_token: CancellationToken,
947) -> Result<()> {
948    let project_path = path.as_deref().unwrap_or_else(|| Path::new("."));
949    let canonical_path = project_path.canonicalize()?;
950
951    println!("{}", "āž• Add to Index".bright_green().bold());
952    println!("{}", "=".repeat(60));
953    println!("šŸ“‚ Project: {}", canonical_path.display());
954
955    // Check if ANY index exists (current directory OR parent directories OR global)
956    let db_info = find_best_database(path.as_deref())?;
957
958    if let Some(db) = db_info {
959        println!("\n{}", "āš ļø  An index already exists!".yellow());
960        println!("\n{}", "Existing Index:".cyan());
961        println!("   Path: {}", db.db_path.display());
962
963        if db.is_global {
964            println!("   Type: {}", "Global".bright_green());
965        } else if !db.is_current {
966            println!("   Type: {} (parent directory)", "Local".bright_green());
967        } else {
968            println!("   Type: {}", "Local".bright_green());
969        }
970
971        println!(
972            "\n{}",
973            "You cannot create a separate index for a subdirectory.".yellow()
974        );
975        println!(
976            "{}",
977            if db.is_global {
978                "The global index will be used for all projects."
979            } else if !db.is_current {
980                "The parent directory index will be used for this subdirectory."
981            } else {
982                "An index already exists for this project."
983            }
984        );
985
986        println!("\n{}", "To use the existing index, simply run:".cyan());
987        println!("  codesearch index");
988
989        return Err(anyhow::anyhow!(
990            "Index already exists in parent or current directory"
991        ));
992    }
993
994    // Check if any index already exists for THIS directory (not parent)
995    let local_db = canonical_path.join(".codesearch.db");
996    let has_local = local_db.exists();
997
998    let repos_path = dirs::home_dir()
999        .ok_or_else(|| anyhow::anyhow!("Could not determine home directory"))?
1000        .join(".codesearch")
1001        .join("repos.json");
1002
1003    let has_global = if repos_path.exists() {
1004        let content = fs::read_to_string(&repos_path)?;
1005        if let Ok(repos) =
1006            serde_json::from_str::<std::collections::HashMap<String, serde_json::Value>>(&content)
1007        {
1008            repos.contains_key(canonical_path.to_str().unwrap_or(""))
1009        } else {
1010            false
1011        }
1012    } else {
1013        false
1014    };
1015
1016    // Conflict checks
1017    if global && has_local {
1018        println!("\n{}", "āŒ Error: Local index already exists!".red());
1019        println!("   A local index already exists at: {}", local_db.display());
1020        println!("   Remove it first with: codesearch index rm");
1021        return Err(anyhow::anyhow!("Local index exists"));
1022    }
1023
1024    if has_local || has_global {
1025        println!(
1026            "\n{}",
1027            "āš ļø  Index already exists for this project!".yellow()
1028        );
1029        println!("   Local: {}", if has_local { "āœ…" } else { "āŒ" });
1030        println!("   Global: {}", if has_global { "āœ…" } else { "āŒ" });
1031        return Ok(());
1032    }
1033
1034    // Create the index
1035    if global {
1036        println!("\n{}", "Creating global index...".cyan());
1037        index(
1038            Some(canonical_path.clone()),
1039            false,
1040            false,
1041            true,
1042            None,
1043            cancel_token.clone(),
1044        )
1045        .await?;
1046        println!("\n{}", "āœ… Global index created!".green());
1047    } else {
1048        println!("\n{}", "Creating local index...".cyan());
1049        index(
1050            Some(canonical_path.clone()),
1051            false,
1052            false,
1053            false,
1054            None,
1055            cancel_token,
1056        )
1057        .await?;
1058        println!("\n{}", "āœ… Local index created!".green());
1059    }
1060
1061    Ok(())
1062}
1063
1064/// Remove the index (local or global, auto-detected)
1065pub async fn remove_from_index(path: Option<PathBuf>) -> Result<()> {
1066    let project_path = path.unwrap_or_else(|| PathBuf::from("."));
1067    let canonical_path = project_path.canonicalize()?;
1068
1069    println!("{}", "āž– Remove Index".bright_red().bold());
1070    println!("{}", "=".repeat(60));
1071    println!("šŸ“‚ Project: {}", canonical_path.display());
1072
1073    // Check what exists
1074    let local_db = canonical_path.join(".codesearch.db");
1075    let has_local = local_db.exists();
1076
1077    let repos_path = dirs::home_dir()
1078        .ok_or_else(|| anyhow::anyhow!("Could not determine home directory"))?
1079        .join(".codesearch")
1080        .join("repos.json");
1081
1082    let has_global = if repos_path.exists() {
1083        let content = fs::read_to_string(&repos_path)?;
1084        if let Ok(repos) =
1085            serde_json::from_str::<std::collections::HashMap<String, serde_json::Value>>(&content)
1086        {
1087            repos.contains_key(canonical_path.to_str().unwrap_or(""))
1088        } else {
1089            false
1090        }
1091    } else {
1092        false
1093    };
1094
1095    if !has_local && !has_global {
1096        println!("\n{}", "āš ļø  No index found for this project.".yellow());
1097        return Ok(());
1098    }
1099
1100    // If both exist (shouldn't happen), remove local with warning
1101    if has_local && has_global {
1102        println!(
1103            "\n{}",
1104            "āš ļø  Warning: Both local and global indexes exist!".yellow()
1105        );
1106        println!("   Removing local index...");
1107        fs::remove_dir_all(&local_db)?;
1108        println!("   {}", "āœ… Local index removed".green());
1109        println!("   (Global index remains)");
1110        return Ok(());
1111    }
1112
1113    // Remove whichever exists
1114    if has_local {
1115        println!("\n{}", "Removing local index...".cyan());
1116        // Note: fastembed cache is inside .codesearch.db/fastembed_cache, so it's removed automatically
1117        fs::remove_dir_all(&local_db)?;
1118        println!("{}", "āœ… Local index removed!".green());
1119    } else if has_global {
1120        println!("\n{}", "Removing global index...".cyan());
1121        unregister_repository(&canonical_path)?;
1122        println!("{}", "āœ… Global index removed!".green());
1123    }
1124
1125    Ok(())
1126}
1127
1128/// Show index status (local or global)
1129pub async fn list_index_status() -> Result<()> {
1130    println!("{}", "šŸ“‹ Index Status".bright_cyan().bold());
1131    println!("{}", "=".repeat(60));
1132
1133    // Try to find the database
1134    let db_info = find_best_database(Some(Path::new(".")))?;
1135
1136    if let Some(db) = db_info {
1137        println!("\n{}", "šŸ’¾ Database:".cyan());
1138        println!("   Path: {}", db.db_path.display());
1139
1140        if db.is_global {
1141            println!("   Type: {}", "Global".bright_green());
1142        } else {
1143            println!("   Type: {}", "Local".bright_green());
1144        }
1145
1146        // Show if this is from a parent directory
1147        if !db.is_current && !db.is_global {
1148            println!("   {}", "(from parent directory)".dimmed());
1149        }
1150
1151        // Get stats
1152        if let Ok(stats) = get_db_stats(&db.db_path).await {
1153            println!("   Status: {}", "āœ… Indexed".green());
1154            println!("   Chunks: {}", stats.chunk_count);
1155            println!("   Size: {:.2} MB", stats.size_mb);
1156        } else {
1157            println!("   Status: {}", "āš ļø  Could not read database".yellow());
1158        }
1159    } else {
1160        println!("\n{}", "No index found for this project.".dimmed());
1161        println!("\nCreate an index with:");
1162        println!("  codesearch index add          # Create local index");
1163        println!("  codesearch index add -g       # Create global index");
1164    }
1165
1166    Ok(())
1167}
1168
1169async fn get_db_stats(db_path: &Path) -> Result<DbStats> {
1170    use crate::vectordb::VectorStore;
1171
1172    if !db_path.exists() {
1173        return Ok(DbStats {
1174            chunk_count: 0,
1175            size_mb: 0.0,
1176        });
1177    }
1178
1179    // Try to get stats from vector store
1180    let store = VectorStore::new(db_path, 384)?;
1181    let stats = store.stats()?;
1182
1183    // Calculate database size
1184    let mut total_size = 0u64;
1185    for entry in std::fs::read_dir(db_path)? {
1186        let entry = entry?;
1187        total_size += entry.metadata()?.len();
1188    }
1189
1190    Ok(DbStats {
1191        chunk_count: stats.total_chunks,
1192        size_mb: total_size as f64 / (1024.0 * 1024.0),
1193    })
1194}
1195
/// Summary figures for one database, as produced by `get_db_stats`.
struct DbStats {
    /// Number of chunks reported by the vector store (0 when the database
    /// path does not exist).
    chunk_count: usize,
    /// Combined size of the database directory's direct entries, in
    /// megabytes (bytes divided by 1024 * 1024).
    size_mb: f64,
}