vyctor 0.1.0

A fast CLI tool for semantic file search using vector embeddings
Documentation
//! File indexing functionality

pub mod ast_chunker;
mod chunker;
mod file_walker;
mod hasher;
pub mod language;
pub mod regex_chunker;

pub use chunker::{Chunk, Chunker};
pub use file_walker::FileWalker;
pub use hasher::content_hash;

// Re-export for library users
#[allow(unused_imports)]
pub use chunker::ChunkType;
#[allow(unused_imports)]
pub use language::{detect_language, detect_language_from_str, Language};

use crate::config::VyctorConfig;
use crate::embeddings::{create_provider, EmbeddingProvider};
use crate::storage::Storage;
use anyhow::{Context, Result};
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;

/// Main indexer that orchestrates the indexing process
///
/// Bundles the components the indexing methods rely on: file discovery,
/// chunking, embedding and persistent storage.
pub struct Indexer {
    /// Index store; `new` opens it at `<root>/.vyctor/index.duckdb`.
    storage: Storage,
    /// Embedding provider created from the `embedding` section of the configuration.
    embedder: Arc<dyn EmbeddingProvider>,
    /// Splits file contents into chunks before they are embedded.
    chunker: Chunker,
    /// Enumerates candidate files; constructed with the root and the configured
    /// include/exclude lists.
    file_walker: FileWalker,
    /// Number of files handled per processing batch in `index_all`
    /// (taken from `embedding.batch_size` in the configuration).
    batch_size: usize,
}

impl Indexer {
    /// Build an indexer rooted at `root`, wiring every component from `config`.
    ///
    /// The index database is opened at `<root>/.vyctor/index.duckdb`.
    ///
    /// # Errors
    ///
    /// Returns an error if the storage cannot be opened or the embedding
    /// provider cannot be created.
    pub fn new(root: &Path, config: &VyctorConfig) -> Result<Self> {
        let indexing = &config.indexing;
        let embedding = &config.embedding;

        let storage = Storage::new(
            &root.join(".vyctor").join("index.duckdb"),
            embedding.dimensions,
        )?;

        // Indexing is a long-running operation, so the provider is always
        // created with verbose output enabled.
        let embedder = create_provider(embedding, true)?;

        Ok(Self {
            storage,
            embedder,
            chunker: Chunker::with_options(
                indexing.chunk_size,
                indexing.chunk_overlap,
                indexing.max_chunk_size,
                indexing.semantic_chunking,
            ),
            file_walker: FileWalker::new(
                root.to_path_buf(),
                indexing.include.clone(),
                indexing.exclude.clone(),
            ),
            batch_size: embedding.batch_size,
        })
    }

    /// Index all files, returning the number of files indexed
    pub async fn index_all(&self, force: bool) -> Result<IndexResult> {
        let files: Vec<_> = self.file_walker.walk().collect();
        let total_files = files.len();

        if total_files == 0 {
            return Ok(IndexResult {
                files_indexed: 0,
                files_skipped: 0,
                files_deleted: 0,
                chunks_created: 0,
            });
        }

        let pb = ProgressBar::new(total_files as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template(
                    "{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} files ({elapsed}) {msg}",
                )
                .unwrap()
                .progress_chars("#>-"),
        );

        let mut total_embed_time = std::time::Duration::ZERO;

        let mut files_indexed = 0;
        let mut files_skipped = 0;
        let mut chunks_created = 0;

        // Get existing file hashes for skip detection (single DB query upfront)
        let existing_hashes: std::collections::HashMap<String, String> = if !force {
            self.storage.get_all_file_hashes()?.into_iter().collect()
        } else {
            std::collections::HashMap::new()
        };

        // Clone what we need for parallel processing
        let root = self.file_walker.root().to_path_buf();
        let chunker = self.chunker.clone();

        // Process files in batches
        for file_batch in files.chunks(self.batch_size) {
            pb.set_message("Reading & chunking files...");

            // Parallel file reading, hashing, and chunking (no DB access)
            let processed: Vec<_> = file_batch
                .par_iter()
                .filter_map(|file_path| {
                    let relative_path = file_path
                        .strip_prefix(&root)
                        .unwrap_or(file_path)
                        .to_string_lossy()
                        .to_string();

                    // Read file content
                    let content = match std::fs::read_to_string(file_path) {
                        Ok(c) => c,
                        Err(_) => return None,
                    };

                    let hash = content_hash(&content);

                    // Check if unchanged (using pre-fetched hashes)
                    if !force {
                        if let Some(existing_hash) = existing_hashes.get(&relative_path) {
                            if existing_hash == &hash {
                                return Some(Err(relative_path)); // Signal: skipped
                            }
                        }
                    }

                    // Chunk the content with file path for language detection
                    let chunks = chunker.chunk_with_path(&content, Some(&relative_path));
                    if chunks.is_empty() {
                        return None;
                    }

                    Some(Ok(ProcessedFile {
                        relative_path,
                        hash,
                        chunks,
                    }))
                })
                .collect();

            // Count skipped and collect files to process
            let mut to_embed: Vec<ProcessedFile> = Vec::new();
            for result in processed {
                match result {
                    Ok(pf) => to_embed.push(pf),
                    Err(_) => {
                        files_skipped += 1;
                        pb.inc(1);
                    }
                }
            }

            if to_embed.is_empty() {
                continue;
            }

            // Upsert files to get IDs (sequential for DB)
            let mut files_with_ids: Vec<(String, i64, Vec<Chunk>)> = Vec::new();
            for pf in to_embed {
                match self.storage.upsert_file(&pf.relative_path, &pf.hash) {
                    Ok(file_id) => {
                        files_with_ids.push((pf.relative_path, file_id, pf.chunks));
                    }
                    Err(e) => {
                        eprintln!("Warning: Could not upsert {}: {}", pf.relative_path, e);
                        pb.inc(1);
                    }
                }
            }

            // Embed and save chunks per file (provides incremental progress)
            if !files_with_ids.is_empty() {
                const EMBED_BATCH_SIZE: usize = 32; // Smaller batches for better progress feedback

                for (_, file_id, chunks) in &files_with_ids {
                    let texts: Vec<_> = chunks.iter().map(|c| c.content.clone()).collect();

                    // Process this file's chunks in micro-batches
                    let mut all_embeddings = Vec::with_capacity(texts.len());
                    for (batch_idx, text_batch) in texts.chunks(EMBED_BATCH_SIZE).enumerate() {
                        let batch_num = batch_idx + 1;
                        let total_batches = texts.len().div_ceil(EMBED_BATCH_SIZE);
                        if total_batches > 1 {
                            pb.set_message(format!(
                                "Embedding batch {}/{} ({} chunks)...",
                                batch_num,
                                total_batches,
                                text_batch.len()
                            ));
                        } else {
                            pb.set_message(format!("Embedding {} chunks...", text_batch.len()));
                        }

                        let embed_start = std::time::Instant::now();
                        let batch_embeddings = self.embedder.embed_batch(text_batch).await?;
                        total_embed_time += embed_start.elapsed();
                        all_embeddings.extend(batch_embeddings);
                    }

                    // Save chunks with embeddings
                    for (chunk_idx, (chunk, result)) in
                        chunks.iter().zip(all_embeddings.iter()).enumerate()
                    {
                        self.storage.insert_chunk(
                            *file_id,
                            chunk_idx as i32,
                            &chunk.content,
                            chunk.start_line as i32,
                            chunk.end_line as i32,
                            &result.embedding,
                        )?;
                        chunks_created += 1;
                    }
                    files_indexed += 1;
                    pb.inc(1);
                }
            }
        }

        pb.finish_with_message(format!("done (embedding: {:?})", total_embed_time));

        // Remove deleted files
        let indexed_paths: HashSet<_> = self.storage.get_all_file_paths()?.into_iter().collect();

        let current_paths: HashSet<_> = files
            .iter()
            .filter_map(|p| {
                p.strip_prefix(self.file_walker.root())
                    .ok()
                    .map(|p| p.to_string_lossy().to_string())
            })
            .collect();

        let deleted_paths: Vec<_> = indexed_paths.difference(&current_paths).cloned().collect();

        let files_deleted = self.storage.delete_files(&deleted_paths)?;

        Ok(IndexResult {
            files_indexed,
            files_skipped,
            files_deleted,
            chunks_created,
        })
    }

    /// Bring the index up to date without forcing a rebuild.
    ///
    /// Delegates to `index_all` with `force` disabled, so files whose stored
    /// hash matches their current content are skipped.
    pub async fn sync(&self) -> Result<IndexResult> {
        let force = false;
        self.index_all(force).await
    }

    /// Rebuild the index from scratch.
    ///
    /// Clears the storage first and then runs `index_all` with `force`
    /// enabled. If clearing fails, that error is returned and no indexing is
    /// attempted.
    pub async fn reindex(&self) -> Result<IndexResult> {
        let force = true;
        self.storage.clear()?;
        self.index_all(force).await
    }

    /// Index a single file
    pub async fn index_file(&self, path: &Path) -> Result<bool> {
        let relative_path = path
            .strip_prefix(self.file_walker.root())
            .unwrap_or(path)
            .to_string_lossy()
            .to_string();

        // Read file content
        let content = std::fs::read_to_string(path)
            .with_context(|| format!("Failed to read file: {}", path.display()))?;

        let hash = content_hash(&content);

        // Check if file needs re-indexing
        if let Ok(Some(existing)) = self.storage.get_file(&relative_path) {
            if existing.content_hash == hash {
                return Ok(false);
            }
        }

        // Chunk the content with file path for language detection
        let chunks = self.chunker.chunk_with_path(&content, Some(&relative_path));
        if chunks.is_empty() {
            return Ok(false);
        }

        // Upsert file
        let file_id = self.storage.upsert_file(&relative_path, &hash)?;

        // Embed and insert chunks
        let texts: Vec<_> = chunks.iter().map(|c| c.content.clone()).collect();
        let embeddings = self.embedder.embed_batch(&texts).await?;

        for (chunk_idx, (chunk, result)) in chunks.iter().zip(embeddings.iter()).enumerate() {
            self.storage.insert_chunk(
                file_id,
                chunk_idx as i32,
                &chunk.content,
                chunk.start_line as i32,
                chunk.end_line as i32,
                &result.embedding,
            )?;
        }

        Ok(true)
    }

    /// Drop a file's entry from the index.
    ///
    /// The lookup key is `path` relative to the walker's root, or `path`
    /// itself when it is not located under the root. The result of the
    /// storage layer's `delete_file` is returned unchanged.
    pub fn remove_file(&self, path: &Path) -> Result<bool> {
        let root = self.file_walker.root();
        let key = match path.strip_prefix(root) {
            Ok(stripped) => stripped,
            Err(_) => path,
        };
        let key = key.to_string_lossy().to_string();

        self.storage.delete_file(&key)
    }

    /// Get the storage instance (for search operations)
    ///
    /// Returns a shared reference to the `Storage` owned by this indexer.
    // NOTE(review): `dead_code` is allowed presumably because not every build
    // target calls this accessor — confirm before removing the attribute.
    #[allow(dead_code)]
    pub fn storage(&self) -> &Storage {
        &self.storage
    }

    /// Get the embedder (for search operations)
    ///
    /// Returns a reference to the shared `Arc` handle; callers that need their
    /// own handle can clone the `Arc`.
    // NOTE(review): `dead_code` is allowed presumably because not every build
    // target calls this accessor — confirm before removing the attribute.
    #[allow(dead_code)]
    pub fn embedder(&self) -> &Arc<dyn EmbeddingProvider> {
        &self.embedder
    }
}

/// Result of an indexing operation
///
/// Counters reported by `Indexer::index_all` (and therefore by `sync` and
/// `reindex`, which delegate to it).
#[derive(Debug, Clone)]
pub struct IndexResult {
    /// Files whose chunks were embedded and written to storage.
    pub files_indexed: usize,
    /// Files skipped because their content hash matched the stored hash.
    pub files_skipped: usize,
    /// Count returned by the storage layer when removing entries for files
    /// that the file walker no longer reports.
    pub files_deleted: usize,
    /// Total number of chunks inserted into storage.
    pub chunks_created: usize,
}

/// Intermediate result from parallel file processing
///
/// Produced by the read/hash/chunk stage of `index_all` for a file that needs
/// to be (re-)embedded.
struct ProcessedFile {
    /// Path relative to the indexing root (or the original path when it is not
    /// located under the root); used as the file's key in storage.
    relative_path: String,
    /// Hash of the file content as returned by `content_hash`.
    hash: String,
    /// Chunks produced by the chunker; never empty, because files without
    /// chunks are dropped before a `ProcessedFile` is built.
    chunks: Vec<Chunk>,
}