collet 0.1.1

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
pub mod bm25;
pub mod disk_cache;
pub mod hasher;
pub mod parser;
pub mod ranker;
pub mod scanner;

use std::collections::HashMap;
use std::path::{Path, PathBuf};

use crate::repo_map::bm25::{Bm25Index, IndexUpdate};
use crate::repo_map::hasher::{ChangeStatus, FileHashCache};
use crate::repo_map::parser::{Symbol, SymbolKind};
use crate::repo_map::ranker::SymbolRanker;
use crate::repo_map::scanner::scan_source_files;

/// The complete repository map with incremental rebuild support.
///
/// Bundles the per-file symbol table, the change detector, the ranker and
/// the BM25 index for one project root. `rebuild()` brings all of them up to
/// date; `to_map_string()` renders the ranked symbol listing.
pub struct RepoMap {
    /// Per-file symbol cache, keyed by the paths returned from
    /// `scan_source_files`. Entries for files that disappear are dropped by
    /// `rebuild()`.
    symbols: HashMap<PathBuf, Vec<Symbol>>,
    /// BLAKE3-based change detection (per the original note; the hasher itself
    /// lives in the `hasher` module). Consulted by `rebuild()` to decide which
    /// files need re-parsing.
    hash_cache: FileHashCache,
    /// PageRank-inspired symbol ranker. Supplies the file order used when
    /// rendering the map.
    ranker: SymbolRanker,
    /// BM25 full-text search index backing `search()` and
    /// `relevant_files_for_query()`.
    bm25: Bm25Index,
    /// The root directory of the project. Used as the scan root and stripped
    /// from file paths when rendering.
    root: PathBuf,
    /// Max approximate tokens for the map output. Initialised to 2000 by
    /// `new()`.
    max_tokens: usize,
    /// Whether this is the first build (no prior index exists). `false` when
    /// state was restored from the disk cache or after the first `rebuild()`.
    first_build: bool,
    /// Cached output of `to_map_string()`. Reset to `None` whenever the
    /// rendered map may have changed.
    /// Uses `Mutex` so the cache can be populated from `&self` calls
    /// while remaining `Sync` (RepoMap is shared via RwLock across threads).
    cached_map_string: std::sync::Mutex<Option<String>>,
}

impl RepoMap {
    /// Create a repo map rooted at `root`.
    ///
    /// When `disk_cache::load` yields a saved state, that state is adopted and
    /// the map is marked as already built, so the next `rebuild()` takes the
    /// incremental BM25 path. Otherwise every component starts empty and the
    /// first `rebuild()` performs a full BM25 build.
    pub fn new(root: &Path) -> Self {
        // Resolve the four stateful components plus the first-build flag in
        // one place, then assemble the struct exactly once.
        let (symbols, hash_cache, ranker, bm25, first_build) = match disk_cache::load(root) {
            Some((symbols, hash_cache, bm25, ranker)) => {
                tracing::info!(
                    files = symbols.len(),
                    "RepoMap restored from disk cache — incremental rebuild only",
                );
                // Restored state already contains a BM25 index: skip the full build.
                (symbols, hash_cache, ranker, bm25, false)
            }
            None => (
                HashMap::new(),
                FileHashCache::new(),
                SymbolRanker::new(),
                Bm25Index::new(),
                true,
            ),
        };

        Self {
            symbols,
            hash_cache,
            ranker,
            bm25,
            root: root.to_path_buf(),
            max_tokens: 2000,
            first_build,
            cached_map_string: std::sync::Mutex::new(None),
        }
    }

    /// Build or incrementally update the repo map.
    ///
    /// Scans the project root, drops symbols of files that no longer exist,
    /// re-parses every file the hash cache reports as new or modified, and
    /// brings the BM25 index in line with the current file set.
    ///
    /// Returns the number of files that were successfully re-parsed. Files
    /// that merely disappeared are not counted in the return value, but they
    /// still invalidate the cached map string, trigger a reference rebuild
    /// and cause the disk cache to be rewritten.
    pub fn rebuild(&mut self) -> usize {
        use rayon::prelude::*;

        let files = scan_source_files(&self.root);

        // Detect deleted files, and remember how many entries were dropped so
        // that a deletion-only rebuild still refreshes everything derived
        // from `symbols`.
        let current_files: std::collections::HashSet<_> = files.iter().cloned().collect();
        let tracked_before = self.symbols.len();
        self.symbols.retain(|k, _| current_files.contains(k));
        let removed = tracked_before - self.symbols.len();

        // Collect BM25 incremental updates
        let mut bm25_updates: Vec<IndexUpdate> = Vec::new();

        // Step 1 (serial): determine which files need re-parsing via hash check
        let mut to_parse: Vec<std::path::PathBuf> = Vec::new();
        for file in &files {
            let status = self.hash_cache.check_file(file);
            match status {
                ChangeStatus::Unchanged => continue,
                ChangeStatus::Modified | ChangeStatus::New => {
                    to_parse.push(file.clone());
                    bm25_updates.push(IndexUpdate::Upsert(file.clone()));
                }
                ChangeStatus::Error => {
                    tracing::warn!("Failed to check file: {}", file.display());
                }
            }
        }

        // Step 2 (parallel): parse all changed files concurrently.
        // Files whose parse fails are skipped; their previous symbols (if any)
        // are left in place.
        let parsed: Vec<(std::path::PathBuf, Vec<Symbol>)> = to_parse
            .par_iter()
            .filter_map(|file| {
                parser::parse_file(file)
                    .ok()
                    .map(|syms| (file.clone(), syms))
            })
            .collect();

        // Step 3 (serial): update symbol map
        let re_parsed = parsed.len();
        for (file, symbols) in parsed {
            self.symbols.insert(file, symbols);
        }

        // The symbol table changed if anything was re-parsed OR removed.
        let symbols_changed = re_parsed > 0 || removed > 0;

        // Rebuild reference counts for ranking
        if symbols_changed {
            self.ranker.build_references(&self.symbols, &self.root);
        }

        // BM25: full build on first run, incremental thereafter
        if self.first_build {
            self.bm25.build(&files, &self.root);
            self.first_build = false;
        } else {
            // Detect deleted files: compare indexed paths to the current file set.
            // Emit IndexUpdate::Remove for any previously-indexed file no longer present.
            let indexed_paths: std::collections::HashSet<std::path::PathBuf> = self
                .bm25
                .indexed_files()
                .into_iter()
                .map(|rel| self.root.join(rel))
                .collect();
            for deleted in indexed_paths.difference(&current_files) {
                if self.bm25.contains(deleted) {
                    bm25_updates.push(IndexUpdate::Remove(deleted.clone()));
                }
            }

            if !bm25_updates.is_empty() {
                self.bm25.update(&bm25_updates, &self.root);
            }

            // Belt-and-suspenders: evict any stale docs that may have been
            // missed by the incremental path (e.g. external file deletions
            // between two rebuild() calls).
            self.bm25.retain_files(&current_files);
        }

        // Invalidate cached map string — will be lazily recomputed.
        // Must also happen on deletions, otherwise the cached text keeps
        // listing files that no longer exist.
        if symbols_changed {
            *self.cached_map_string.lock().unwrap() = None;
        }

        tracing::info!(
            "Repo map: {} files, {} re-parsed, BM25: {} docs / {} terms",
            self.symbols.len(),
            re_parsed,
            self.bm25.doc_count(),
            self.bm25.term_count(),
        );

        // Persist to disk cache (best-effort, non-blocking for callers).
        if (symbols_changed || self.symbols.is_empty())
            && let Err(e) = disk_cache::save(
                &self.root,
                &self.symbols,
                &self.hash_cache,
                &self.bm25,
                &self.ranker,
            )
        {
            tracing::warn!("Failed to save RepoMap disk cache: {e}");
        }

        re_parsed
    }

    /// Borrowing iterator over every tracked file together with its symbols.
    ///
    /// Iteration order is that of the underlying `HashMap`, i.e. unspecified.
    pub fn all_symbols(&self) -> impl Iterator<Item = (&PathBuf, &Vec<Symbol>)> {
        let table = &self.symbols;
        table.into_iter()
    }

    /// Hand the ranker the set of files relevant to the current conversation.
    ///
    /// The rendered map depends on the ranking, so the cached map string is
    /// discarded as well.
    pub fn set_conversation_files(&mut self, files: Vec<PathBuf>) {
        self.ranker.set_conversation_files(files);

        // Ranking input changed: drop the cached rendering.
        let mut cache_slot = self.cached_map_string.lock().unwrap();
        *cache_slot = None;
    }

    /// Invalidate cached state for a single file (e.g. after a tool edited it).
    ///
    /// Clears the file's entry in the hash cache and drops the cached map
    /// string. If the file is currently present in the BM25 index it is also
    /// removed from it, so search results don't reflect stale content.
    pub fn invalidate(&mut self, path: &Path) {
        self.hash_cache.invalidate(path);
        *self.cached_map_string.lock().unwrap() = None;
        // Remove from BM25 index so stale content isn't returned by search.
        if self.bm25.contains(path) {
            self.bm25
                .update(&[bm25::IndexUpdate::Remove(path.to_path_buf())], &self.root);
            // Inside this branch the file is known to have been indexed, so
            // the field is logged as a constant instead of materialising the
            // full list of indexed files just to test it for emptiness.
            tracing::trace!(
                path = %path.display(),
                was_indexed = true,
                "BM25: removed invalidated file"
            );
        }
    }

    /// Generate the map text for LLM context injection, ranked by importance.
    /// Result is cached until the next `rebuild()` modifies symbols.
    pub fn to_map_string(&self) -> String {
        {
            let cached = self.cached_map_string.lock().unwrap();
            if let Some(ref s) = *cached {
                return s.clone();
            }
        }
        let result = self.to_map_string_inner();
        *self.cached_map_string.lock().unwrap() = Some(result.clone());
        result
    }

    /// Render the map text without touching the cache.
    ///
    /// Files are emitted in the order produced by the ranker; each file
    /// contributes a `path:` header followed by one line per symbol. Size is
    /// tracked as an approximate token count (`bytes / 4` per emitted line).
    ///
    /// The budget (`max_tokens`) is enforced both between files and between
    /// symbols of the same file, so a single very large file cannot overshoot
    /// the budget by more than one line. Whenever content is actually left
    /// out, a truncation marker is appended.
    fn to_map_string_inner(&self) -> String {
        let mut output = String::new();
        let mut approx_tokens = 0;
        let mut truncated = false;

        // Rank files by PageRank-style importance
        let ranked = self.ranker.rank_files(&self.symbols);

        'files: for (file, symbols, _score) in ranked {
            // Files without symbols never produce output, so they must not
            // trigger the truncation marker either.
            if symbols.is_empty() {
                continue;
            }

            if approx_tokens >= self.max_tokens {
                truncated = true;
                break;
            }

            let rel = file.strip_prefix(&self.root).unwrap_or(file);
            let file_header = format!("{}:\n", rel.display());
            output.push_str(&file_header);
            approx_tokens += file_header.len() / 4;

            for (idx, sym) in symbols.iter().enumerate() {
                // Always emit at least one symbol under a header that has
                // already been written; stop afterwards once the budget is spent.
                if idx > 0 && approx_tokens >= self.max_tokens {
                    truncated = true;
                    break 'files;
                }

                let kind_str = match sym.kind {
                    SymbolKind::Function => "fn",
                    SymbolKind::Struct => "struct",
                    SymbolKind::Enum => "enum",
                    SymbolKind::Trait => "trait",
                    SymbolKind::Impl => "impl",
                    SymbolKind::Const => "const",
                    SymbolKind::Type => "type",
                    SymbolKind::Mod => "mod",
                    SymbolKind::Macro => "macro",
                };

                let line = if let Some(ref sig) = sym.signature {
                    format!("  {kind_str} {} [L{}] {sig}\n", sym.name, sym.line)
                } else {
                    format!("  {kind_str} {} [L{}]\n", sym.name, sym.line)
                };

                approx_tokens += line.len() / 4;
                output.push_str(&line);
            }
        }

        if truncated {
            output.push_str("\n[... truncated to fit context window]\n");
        }

        output
    }

    /// Number of symbols across all tracked files.
    pub fn symbol_count(&self) -> usize {
        self.symbols.values().map(Vec::len).sum()
    }

    /// Total number of files tracked.
    pub fn file_count(&self) -> usize {
        self.symbols.len()
    }

    /// Run `query` against the BM25 index.
    ///
    /// Delegates to `Bm25Index::search`, passing `max_results` through
    /// unchanged.
    pub fn search(&self, query: &str, max_results: usize) -> Vec<bm25::SearchResult> {
        let index = &self.bm25;
        index.search(query, max_results)
    }

    /// Pick the files most relevant to a user query, for AI context injection.
    ///
    /// Delegates to `Bm25Index::relevant_files`; the result is a list of
    /// `(relative_path, score)` pairs ranked by BM25 relevance.
    pub fn relevant_files_for_query(&self, query: &str, top_k: usize) -> Vec<(String, f64)> {
        let index = &self.bm25;
        index.relevant_files(query, top_k)
    }

    /// Access the BM25 index directly.
    pub fn bm25(&self) -> &Bm25Index {
        &self.bm25
    }

    /// Returns true once the map no longer awaits its first build.
    ///
    /// That is the case after the first `rebuild()` has run, and also right
    /// after construction when the state was restored from the disk cache.
    pub fn is_ready(&self) -> bool {
        let awaiting_first_build = self.first_build;
        !awaiting_first_build
    }
}