// zeph-index 0.19.0

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Project indexing orchestrator: walk → chunk → embed → store.
//!
//! The top-level type is [`CodeIndexer`]. It drives a full project index via
//! [`CodeIndexer::index_project`] and supports incremental updates via
//! [`CodeIndexer::reindex_file`] (called by the file watcher).
//!
//! ## Concurrency model
//!
//! Files are processed in two nested loops:
//!
//! 1. **Memory batches** — files are split into groups of
//!    [`IndexerConfig::memory_batch_size`] to bound peak in-flight state.
//! 2. **Per-batch concurrency** — within each memory batch, files are processed
//!    concurrently up to [`IndexerConfig::embed_concurrency`] using
//!    `futures::stream::buffer_unordered`.
//!
//! Chunks that already exist in the store (matched by content hash) are skipped
//! without any embedding call, so re-running over an unchanged project costs only
//! the single dimension-probe embedding issued at the start of
//! [`CodeIndexer::index_project`].

use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;

use futures::StreamExt as _;
use tokio::sync::watch;

use crate::chunker::{ChunkerConfig, CodeChunk, chunk_file};
use crate::context::contextualize_for_embedding;
use crate::error::{IndexError, Result};
use crate::languages::{detect_language, is_indexable};
use crate::store::{ChunkInsert, CodeStore};
use zeph_llm::any::AnyProvider;
use zeph_llm::provider::LlmProvider;

/// Configuration for [`CodeIndexer`].
///
/// All fields have reasonable defaults via [`Default`]. Override individual fields
/// when you need to tune throughput, memory use, or API rate limits.
///
/// # Examples
///
/// ```no_run
/// use zeph_index::indexer::IndexerConfig;
///
/// let config = IndexerConfig::default();
/// assert_eq!(config.concurrency, 4);
/// assert_eq!(config.embed_concurrency, 2);
///
/// // High-throughput mode for a fast local embedding server.
/// let fast = IndexerConfig {
///     embed_concurrency: 8,
///     memory_batch_size: 64,
///     ..IndexerConfig::default()
/// };
/// ```
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Chunker configuration controlling chunk size thresholds.
    ///
    /// Passed by reference to `chunk_file` for every indexed file.
    pub chunker: ChunkerConfig,
    /// Intended number of files to process concurrently. Default: 4.
    ///
    /// NOTE(review): nothing in this module reads this field. `index_project` bounds
    /// the number of in-flight files with [`embed_concurrency`](Self::embed_concurrency)
    /// instead. Confirm whether it is consumed elsewhere or is vestigial.
    pub concurrency: usize,
    /// Maximum number of new chunks to upsert per Qdrant call. Default: 32.
    ///
    /// Larger values reduce round-trips but increase per-call memory.
    ///
    /// NOTE(review): this module upserts all new chunks of a file in one
    /// `upsert_chunks_batch` call and never reads this field — presumably the limit
    /// is applied inside the store; verify.
    pub batch_size: usize,
    /// Number of files per outer memory batch during initial indexing. Default: 32.
    ///
    /// Lowering this reduces peak heap usage at the cost of more `yield_now` calls
    /// (one is issued after every batch). A value of `0` is treated as `1`.
    pub memory_batch_size: usize,
    /// Maximum file size in bytes. Default: 512 KiB.
    ///
    /// Files strictly larger than this are skipped without an error; the skip is
    /// only logged at debug level. Large files (e.g. generated code, vendored
    /// libraries) rarely provide useful retrieval signal and are expensive to embed.
    pub max_file_bytes: usize,
    /// Maximum number of files processed concurrently within a memory batch. Default: 2.
    ///
    /// Each in-flight file issues at most one `embed_batch` call, so this also caps
    /// the number of parallel embedding requests. A value of `0` is treated as `1`.
    ///
    /// Keep this low when using hosted embedding APIs with strict TPM rate limits.
    pub embed_concurrency: usize,
}

impl Default for IndexerConfig {
    /// Build the stock configuration: default chunker, 4-way `concurrency`,
    /// 32-entry batches, a 512 KiB file-size cap and 2-way embedding concurrency.
    fn default() -> Self {
        const CONCURRENCY: usize = 4;
        const UPSERT_BATCH_SIZE: usize = 32;
        const MEMORY_BATCH_SIZE: usize = 32;
        // 512 KiB expressed in bytes.
        const MAX_FILE_BYTES: usize = 512 * 1024;
        const EMBED_CONCURRENCY: usize = 2;

        let chunker = ChunkerConfig::default();

        Self {
            chunker,
            concurrency: CONCURRENCY,
            batch_size: UPSERT_BATCH_SIZE,
            memory_batch_size: MEMORY_BATCH_SIZE,
            max_file_bytes: MAX_FILE_BYTES,
            embed_concurrency: EMBED_CONCURRENCY,
        }
    }
}

/// Snapshot of indexing progress, sent through a [`tokio::sync::watch`] channel.
///
/// The caller passes an `Option<&watch::Sender<IndexProgress>>` to
/// [`CodeIndexer::index_project`]. Each time a file finishes — whether it succeeded
/// or failed — the sender receives an updated snapshot so the TUI or CLI can display
/// a live progress bar. Send failures (for example when every receiver has been
/// dropped) are ignored by the indexer.
///
/// # Examples
///
/// ```no_run
/// use tokio::sync::watch;
/// use zeph_index::indexer::IndexProgress;
///
/// let (tx, mut rx) = watch::channel(IndexProgress::default());
/// tx.send(IndexProgress { files_done: 1, files_total: 10, chunks_created: 5 }).unwrap();
/// assert_eq!(rx.borrow().files_done, 1);
/// ```
#[derive(Debug, Clone, Default)]
pub struct IndexProgress {
    /// Number of files processed so far, including files that ended in an error.
    pub files_done: usize,
    /// Total number of indexable files discovered in the project root.
    pub files_total: usize,
    /// Cumulative number of new chunks created across all processed files.
    pub chunks_created: usize,
}

/// Summary statistics produced at the end of a full [`CodeIndexer::index_project`] run.
///
/// Errors are collected rather than short-circuiting so the majority of the project
/// is indexed even when individual files fail (e.g. due to transient IO errors or
/// unsupported encodings).
#[derive(Debug, Default)]
pub struct IndexReport {
    /// Number of indexable files whose processing finished, successfully or not.
    ///
    /// Entries the directory walker visits but filters out (non-files and files that
    /// are not indexable) are not counted.
    pub files_scanned: usize,
    /// Number of files that produced at least one new chunk.
    pub files_indexed: usize,
    /// New chunks embedded and upserted into Qdrant.
    pub chunks_created: usize,
    /// Chunks skipped because an identical content hash already exists in the store.
    pub chunks_skipped: usize,
    /// Chunks deleted from the store because their file was removed from the project
    /// (it is listed by the store but was not found by the current walk).
    pub chunks_removed: usize,
    /// Error messages collected during the run.
    ///
    /// Per-file indexing failures are formatted as `"<rel_path>: <error>"`; failures
    /// while removing chunks of vanished files as `"cleanup <file>: <error>"`.
    pub errors: Vec<String>,
    /// Wall-clock duration of the entire run in milliseconds (saturating at `u64::MAX`).
    pub duration_ms: u64,
}

/// Orchestrates code indexing over a project tree.
///
/// `CodeIndexer` is the primary driver of the indexing pipeline. It walks the file
/// tree, delegates per-file work to `FileIndexWorker`, and coordinates the Qdrant +
/// `SQLite` writes via [`CodeStore`].
///
/// # Cloning and concurrency
///
/// `CodeIndexer` is **not** `Clone` — it is typically wrapped in an [`Arc`] and shared
/// between the initial indexing task and the file watcher.
///
/// # Examples
///
/// ```no_run
/// use std::sync::Arc;
/// use zeph_index::indexer::{CodeIndexer, IndexerConfig};
/// use zeph_index::store::CodeStore;
/// # async fn example() -> zeph_index::Result<()> {
/// # let store: CodeStore = panic!("placeholder");
/// # let provider: Arc<zeph_llm::any::AnyProvider> = panic!("placeholder");
///
/// let indexer = CodeIndexer::new(store, provider, IndexerConfig::default());
/// let report = indexer.index_project(std::path::Path::new("."), None).await?;
/// println!("indexed {} files in {}ms", report.files_indexed, report.duration_ms);
/// # Ok(())
/// # }
/// ```
pub struct CodeIndexer {
    /// Chunk store handle; a clone is handed to every per-file worker.
    store: CodeStore,
    /// Embedding provider, shared with workers through `Arc::clone`.
    provider: Arc<AnyProvider>,
    /// Indexing configuration; cloned into every per-file worker.
    config: IndexerConfig,
}

impl CodeIndexer {
    /// Create a new `CodeIndexer`.
    ///
    /// The indexer takes ownership of all three arguments. Each per-file worker later
    /// receives its own `store.clone()`, `Arc::clone(&provider)` and `config.clone()`.
    #[must_use]
    pub fn new(store: CodeStore, provider: Arc<AnyProvider>, config: IndexerConfig) -> Self {
        Self {
            store,
            provider,
            config,
        }
    }

    /// Render `path` relative to `root` as the string key used for store lookups.
    ///
    /// Falls back to the unmodified `path` when it does not start with `root`.
    /// Non-UTF-8 components are converted lossily.
    ///
    /// Every place that derives a store key goes through this helper so the keys
    /// written while indexing and the keys compared during stale-file cleanup are
    /// guaranteed to be produced the same way.
    fn relative_path(root: &Path, path: &Path) -> String {
        path.strip_prefix(root)
            .unwrap_or(path)
            .to_string_lossy()
            .into_owned()
    }

    /// Full project indexing with incremental change detection.
    ///
    /// Steps:
    ///
    /// 1. Embed a probe string to learn the vector dimension and ensure the
    ///    collection exists with that size.
    /// 2. Walk `root` on a blocking thread, keeping regular files that
    ///    `is_indexable` accepts.
    /// 3. Process the files in memory batches; within a batch, files run
    ///    concurrently through `buffer_unordered`.
    /// 4. Remove chunks of files the store still lists but the walk no longer found.
    ///
    /// When `progress_tx` is `Some`, a fresh [`IndexProgress`] is sent after every
    /// finished file (success or failure); send errors are ignored.
    ///
    /// # Errors
    ///
    /// Returns an error if the probe embedding fails, its length does not convert to
    /// `u64`, collection setup fails, the directory-walk task panics, or the store
    /// cannot list its indexed files. Failures for individual files (indexing or
    /// stale-chunk removal) do not abort the run; they are collected in
    /// [`IndexReport::errors`].
    #[allow(clippy::too_many_lines)]
    pub async fn index_project(
        &self,
        root: &Path,
        progress_tx: Option<&watch::Sender<IndexProgress>>,
    ) -> Result<IndexReport> {
        let start = std::time::Instant::now();
        let mut report = IndexReport::default();

        // A throwaway embedding reveals the vector dimension the collection must use.
        let probe = self.provider.embed("probe").await?;
        let vector_size = u64::try_from(probe.len())?;
        self.store.ensure_collection(vector_size).await?;

        // The directory walk is synchronous IO, so it runs on the blocking pool.
        let root_buf = root.to_path_buf();
        let (entries, current_files) = tokio::task::spawn_blocking(move || {
            // `flatten()` drops entries the walker failed to read.
            let entries: Vec<_> = ignore::WalkBuilder::new(&root_buf)
                .hidden(true)
                .git_ignore(true)
                .build()
                .flatten()
                .filter(|e| e.file_type().is_some_and(|ft| ft.is_file()) && is_indexable(e.path()))
                .collect();

            // Relative paths of everything found now; used below to detect files that
            // the store still knows about but that no longer exist in the project.
            let current_files: HashSet<String> = entries
                .iter()
                .map(|entry| Self::relative_path(&root_buf, entry.path()))
                .collect();
            (entries, current_files)
        })
        .await
        .map_err(|e| IndexError::Other(format!("directory walk panicked: {e}")))?;

        let total = entries.len();
        tracing::info!(total, "indexing started");

        // Clamp to 1 so a zero in the config cannot stall `buffer_unordered` or make
        // `chunks` panic.
        let concurrency = self.config.embed_concurrency.max(1);
        let memory_batch_size = self.config.memory_batch_size.max(1);
        let mut files_done = 0usize;

        for batch in entries.chunks(memory_batch_size) {
            let store = self.store.clone();
            let provider = Arc::clone(&self.provider);
            let config = self.config.clone();

            // Resolve paths eagerly so the async closures below have no lifetime dependency on
            // `entries` or `root`.
            let file_pairs: Vec<(String, std::path::PathBuf)> = batch
                .iter()
                .map(|entry| {
                    let abs = entry.path().to_path_buf();
                    (Self::relative_path(root, &abs), abs)
                })
                .collect();

            let mut stream =
                futures::stream::iter(file_pairs.into_iter().map(|(rel_path, abs_path)| {
                    let store = store.clone();
                    let provider = Arc::clone(&provider);
                    let config = config.clone();
                    async move {
                        let worker = FileIndexWorker {
                            store,
                            provider,
                            config,
                        };
                        let result = worker.index_file(&abs_path, &rel_path).await;
                        (rel_path, result)
                    }
                }))
                .buffer_unordered(concurrency);

            while let Some((rel_path, outcome)) = stream.next().await {
                // Counted for failures too, so progress always reaches `total`.
                report.files_scanned += 1;
                files_done += 1;
                match outcome {
                    Ok((created, skipped)) => {
                        if created > 0 {
                            report.files_indexed += 1;
                        }
                        report.chunks_created += created;
                        report.chunks_skipped += skipped;
                        tracing::info!(
                            file = %rel_path,
                            progress = format_args!("{files_done}/{total}"),
                            created,
                            skipped,
                        );
                    }
                    Err(e) => {
                        report.errors.push(format!("{rel_path}: {e:#}"));
                    }
                }
                if let Some(tx) = progress_tx {
                    // A send error only means nobody is listening any more.
                    let _ = tx.send(IndexProgress {
                        files_done,
                        files_total: total,
                        chunks_created: report.chunks_created,
                    });
                }
            }

            // Drop stream to release all in-flight future state before the next batch.
            drop(stream);
            tokio::task::yield_now().await;
        }

        // Stale-file cleanup: anything the store lists that the walk did not find.
        let indexed = self.store.indexed_files().await?;
        for old_file in &indexed {
            if !current_files.contains(old_file) {
                match self.store.remove_file_chunks(old_file).await {
                    Ok(n) => report.chunks_removed += n,
                    Err(e) => report.errors.push(format!("cleanup {old_file}: {e:#}")),
                }
            }
        }

        report.duration_ms = start.elapsed().as_millis().try_into().unwrap_or(u64::MAX);
        Ok(report)
    }

    /// Re-index a specific file (for file watcher).
    ///
    /// The chunks currently stored for the file are removed first, then the file is
    /// indexed again from disk. Returns the number of chunks created.
    ///
    /// # Errors
    ///
    /// Returns an error if removing the old chunks fails, or if reading, chunking,
    /// embedding, or storing the file fails. In the latter case the old chunks have
    /// already been removed.
    pub async fn reindex_file(&self, root: &Path, abs_path: &Path) -> Result<usize> {
        let rel_path = Self::relative_path(root, abs_path);

        self.store.remove_file_chunks(&rel_path).await?;
        let worker = FileIndexWorker {
            store: self.store.clone(),
            provider: Arc::clone(&self.provider),
            config: self.config.clone(),
        };
        let (created, _) = worker.index_file(abs_path, &rel_path).await?;
        Ok(created)
    }
}

/// Per-file indexing worker used by the futures that run inside `buffer_unordered`.
///
/// Each worker owns its own handles (a cloned store, an `Arc`-shared provider and a
/// cloned config), so the future driving it does not borrow from [`CodeIndexer`].
///
/// NOTE(review): the struct derives no traits, so it is not itself `Clone`; callers
/// build a fresh worker per file from cloned fields.
struct FileIndexWorker {
    /// Store used for hash lookups and chunk upserts.
    store: CodeStore,
    /// Provider used for the `embed_batch` call.
    provider: Arc<AnyProvider>,
    /// Supplies `max_file_bytes` and the chunker settings.
    config: IndexerConfig,
}

impl FileIndexWorker {
    /// Embed and upsert all new chunks from a single file.
    ///
    /// `abs_path` is the location read from disk; `rel_path` is the path handed to
    /// the chunker and used in log output.
    ///
    /// New chunks (those whose content hash is not already in the store) are
    /// accumulated, embedded in order with one `embed_batch` call, and upserted in a
    /// single batch call to minimise round-trips to `Qdrant` and `SQLite`.
    ///
    /// Returns `(created, skipped)`: the number of chunks the store reports as
    /// upserted and the number of chunks skipped because their hash already existed.
    /// Files larger than `max_file_bytes` and files that yield no chunks return
    /// `(0, 0)`.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be stat'ed or read as UTF-8 text, its
    /// language is not detected, chunking fails, a store call fails, the embedding
    /// call fails, or the provider returns a different number of vectors than chunks
    /// were submitted.
    async fn index_file(&self, abs_path: &Path, rel_path: &str) -> Result<(usize, usize)> {
        let metadata = tokio::fs::metadata(abs_path).await?;
        // `usize` never exceeds 64 bits on supported targets; saturate rather than cast.
        let max_bytes = u64::try_from(self.config.max_file_bytes).unwrap_or(u64::MAX);
        if metadata.len() > max_bytes {
            tracing::debug!(
                file = %abs_path.display(),
                size = metadata.len(),
                "skipping oversized file"
            );
            return Ok((0, 0));
        }
        let source = tokio::fs::read_to_string(abs_path).await?;
        let lang = detect_language(abs_path).ok_or(IndexError::UnsupportedLanguage)?;

        let chunks = chunk_file(&source, rel_path, lang, &self.config.chunker)?;
        if chunks.is_empty() {
            // Nothing to look up or embed; skip the store round-trip.
            return Ok((0, 0));
        }

        // Batch-check which hashes already exist to avoid N individual queries.
        let all_hashes: Vec<&str> = chunks.iter().map(|c| c.content_hash.as_str()).collect();
        let existing = self.store.existing_hashes(&all_hashes).await?;

        let mut new_chunks: Vec<CodeChunk> = Vec::new();
        let mut skipped = 0usize;

        for chunk in chunks {
            if existing.contains(&chunk.content_hash) {
                skipped += 1;
            } else {
                new_chunks.push(chunk);
            }
        }

        if new_chunks.is_empty() {
            return Ok((0, skipped));
        }

        // Embed all new chunks in a single batch call, then zip with inserts.
        let embedding_texts: Vec<String> =
            new_chunks.iter().map(contextualize_for_embedding).collect();
        let text_refs: Vec<&str> = embedding_texts.iter().map(String::as_str).collect();
        let vectors = self.provider.embed_batch(&text_refs).await?;

        // `zip` stops at the shorter side, so a short or long reply would silently
        // drop chunks or pair them with the wrong vector. Fail loudly instead.
        if vectors.len() != new_chunks.len() {
            return Err(IndexError::Other(format!(
                "embedding provider returned {} vectors for {} chunks",
                vectors.len(),
                new_chunks.len()
            )));
        }

        let batch: Vec<(ChunkInsert<'_>, Vec<f32>)> = new_chunks
            .iter()
            .zip(vectors)
            .map(|(chunk, vector)| (chunk_to_insert(chunk), vector))
            .collect();

        let created = self.store.upsert_chunks_batch(batch).await?.len();

        if created > 0 {
            tracing::debug!("{rel_path}: {created} chunks indexed, {skipped} unchanged");
        }

        Ok((created, skipped))
    }
}

fn chunk_to_insert(chunk: &CodeChunk) -> ChunkInsert<'_> {
    ChunkInsert {
        file_path: &chunk.file_path,
        language: chunk.language.id(),
        node_type: &chunk.node_type,
        entity_name: chunk.entity_name.as_deref(),
        line_start: chunk.line_range.0,
        line_end: chunk.line_range.1,
        code: &chunk.code,
        scope_chain: &chunk.scope_chain,
        content_hash: &chunk.content_hash,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A default progress snapshot starts with every counter at zero.
    #[test]
    fn index_progress_default() {
        let p = IndexProgress::default();
        assert_eq!(p.files_done, 0);
        assert_eq!(p.files_total, 0);
        assert_eq!(p.chunks_created, 0);
    }

    /// Sending into a watch channel whose receiver is gone returns an error that the
    /// indexer discards; it must never panic.
    #[test]
    fn progress_send_no_receivers_is_ignored() {
        let (tx, rx) = tokio::sync::watch::channel(IndexProgress::default());
        drop(rx);
        // send with no receivers must not panic
        let _ = tx.send(IndexProgress {
            files_done: 1,
            files_total: 5,
            chunks_created: 3,
        });
    }

    /// A watch channel keeps only the most recent value, so after several sends the
    /// receiver observes the last snapshot.
    #[test]
    fn progress_send_multiple_times_accumulates() {
        let (tx, rx) = tokio::sync::watch::channel(IndexProgress::default());
        for i in 1..=3usize {
            let _ = tx.send(IndexProgress {
                files_done: i,
                files_total: 3,
                chunks_created: i * 2,
            });
        }
        let p = rx.borrow();
        assert_eq!(p.files_done, 3);
        assert_eq!(p.files_total, 3);
        assert_eq!(p.chunks_created, 6);
    }

    #[test]
    fn progress_none_tx_skips_send() {
        // When progress_tx is None the loop body must not panic — verified by
        // constructing the same conditional used in index_project.
        let progress_tx: Option<&tokio::sync::watch::Sender<IndexProgress>> = None;
        let entries = [1usize, 2, 3];
        for (i, _) in entries.iter().enumerate() {
            if let Some(tx) = progress_tx {
                let _ = tx.send(IndexProgress {
                    files_done: i + 1,
                    files_total: entries.len(),
                    chunks_created: 0,
                });
            }
        }
        // reaching here means no panic when tx is None
    }

    /// Every field `chunk_to_insert` populates must mirror the source chunk.
    #[test]
    fn chunk_to_insert_maps_fields() {
        let chunk = CodeChunk {
            code: "fn test() {}".to_string(),
            file_path: "src/lib.rs".to_string(),
            language: crate::languages::Lang::Rust,
            node_type: "function_item".to_string(),
            entity_name: Some("test".to_string()),
            line_range: (1, 3),
            scope_chain: "Foo".to_string(),
            imports: String::new(),
            content_hash: "abc".to_string(),
        };

        let insert = chunk_to_insert(&chunk);
        assert_eq!(insert.file_path, "src/lib.rs");
        assert_eq!(insert.language, "rust");
        assert_eq!(insert.node_type, "function_item");
        assert_eq!(insert.entity_name, Some("test"));
        assert_eq!(insert.line_start, 1);
        assert_eq!(insert.line_end, 3);
        assert_eq!(insert.code, "fn test() {}");
        assert_eq!(insert.scope_chain, "Foo");
        assert_eq!(insert.content_hash, "abc");
    }

    /// Pins every default declared in `IndexerConfig::default`.
    #[test]
    fn default_config() {
        let config = IndexerConfig::default();
        assert_eq!(config.chunker.target_size, 600);
        assert_eq!(config.concurrency, 4);
        assert_eq!(config.batch_size, 32);
        assert_eq!(config.memory_batch_size, 32);
        assert_eq!(config.max_file_bytes, 512 * 1024);
        assert_eq!(config.embed_concurrency, 2);
    }

    /// Struct-update syntax overrides only the named fields.
    #[test]
    fn indexer_config_custom_concurrency_and_batch_size() {
        let config = IndexerConfig {
            concurrency: 8,
            batch_size: 64,
            ..IndexerConfig::default()
        };
        assert_eq!(config.concurrency, 8);
        assert_eq!(config.batch_size, 64);
        // Untouched fields keep their defaults.
        assert_eq!(config.embed_concurrency, 2);
    }

    /// A default report has zeroed counters and no errors.
    #[test]
    fn index_report_defaults() {
        let report = IndexReport::default();
        assert_eq!(report.files_scanned, 0);
        assert_eq!(report.files_indexed, 0);
        assert_eq!(report.chunks_created, 0);
        assert_eq!(report.chunks_skipped, 0);
        assert_eq!(report.chunks_removed, 0);
        assert_eq!(report.duration_ms, 0);
        assert!(report.errors.is_empty());
    }
}