collet 0.1.1

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
use std::path::{Path, PathBuf};

use crate::search::tokenizer;

use super::{Bm25Index, Document};

impl Bm25Index {
    /// Index a single file, replacing any entry already stored for the same path.
    ///
    /// The file is read as UTF-8 text. If it cannot be read, the call is a silent
    /// no-op and the index (including any earlier entry for `file`) is left
    /// untouched. Term frequencies are built from the file content plus the tokens
    /// of its path relative to `self.root`; when `file` is not under the root the
    /// path is used as given.
    pub(super) fn index_file(&mut self, file: &Path) {
        let content = match std::fs::read_to_string(file) {
            Ok(text) => text,
            // Best effort: unreadable (or non-UTF-8) files are skipped.
            Err(_) => return,
        };

        let rel_path = file
            .strip_prefix(&self.root)
            .unwrap_or(file)
            .to_string_lossy()
            .into_owned();

        // Use the shared tokenizer for both the content and the path, so a query
        // can match on file names as well as on code.
        let mut tf = tokenizer::tokenize_code(&content);
        for token in tokenizer::tokenize_path(&rel_path) {
            *tf.entry(token).or_default() += 1;
        }

        let token_count: u32 = tf.values().sum();

        // Re-indexing an already-known path must first retract the statistics of
        // the previous version; otherwise `total_tokens` and `doc_freq` would count
        // the document twice. This is a no-op for paths that are not indexed yet.
        self.remove_file(file);

        self.total_tokens += u64::from(token_count);

        // Update doc_freq: +1 for each unique term in this document.
        for term in tf.keys() {
            *self.doc_freq.entry(term.clone()).or_default() += 1;
        }

        self.documents.insert(
            file.to_path_buf(),
            Document {
                rel_path,
                abs_path: file.to_path_buf(),
                tf,
                token_count,
            },
        );
    }

    /// Remove a single file from the index, adjusting `doc_freq` and `total_tokens`.
    ///
    /// Removing a path that is not indexed is a no-op.
    pub(super) fn remove_file(&mut self, file: &Path) {
        if let Some(old_doc) = self.documents.remove(file) {
            self.total_tokens = self
                .total_tokens
                .saturating_sub(u64::from(old_doc.token_count));

            // Subtract this document's contribution from doc_freq, dropping terms
            // that no remaining document contains.
            for term in old_doc.tf.keys() {
                if let Some(count) = self.doc_freq.get_mut(term) {
                    *count = count.saturating_sub(1);
                    if *count == 0 {
                        self.doc_freq.remove(term);
                    }
                }
            }
        }
    }

    /// Core ranking: returns `(rel_path, abs_path, score)` sorted by descending score.
    ///
    /// Only documents with a strictly positive score are returned, and at most
    /// `max_results` of them. Documents with equal scores are ordered by
    /// `rel_path` (then `abs_path`) so the output, and the set that survives
    /// truncation, does not depend on the iteration order of `self.documents`.
    /// An empty index, a query that yields no tokens, or `max_results == 0` all
    /// produce an empty vector.
    pub(super) fn rank(&self, query: &str, max_results: usize) -> Vec<(String, PathBuf, f64)> {
        use crate::search::scoring::{Bm25Params, bm25_score};

        const PARAMS: Bm25Params = Bm25Params { k1: 1.2, b: 0.5 };

        if max_results == 0 || self.documents.is_empty() {
            return Vec::new();
        }

        let query_tokens = tokenizer::tokenize_query(query);
        if query_tokens.is_empty() {
            return Vec::new();
        }

        let avg_dl = self.avg_doc_len();
        let n = self.documents.len();

        let mut scores: Vec<(&Document, f64)> = self
            .documents
            .values()
            .filter_map(|doc| {
                let score = bm25_score(
                    &doc.tf,
                    doc.token_count,
                    &query_tokens,
                    &self.doc_freq,
                    n,
                    avg_dl,
                    &PARAMS,
                );
                // Keeps only real matches; this also filters out NaN, so the
                // comparison below always sees ordinary positive numbers.
                (score > 0.0).then_some((doc, score))
            })
            .collect();

        // Highest score first; break ties by path for a deterministic result.
        scores.sort_by(|a, b| {
            b.1.total_cmp(&a.1)
                .then_with(|| a.0.rel_path.cmp(&b.0.rel_path))
                .then_with(|| a.0.abs_path.cmp(&b.0.abs_path))
        });
        scores.truncate(max_results);

        scores
            .into_iter()
            .map(|(doc, score)| (doc.rel_path.clone(), doc.abs_path.clone(), score))
            .collect()
    }
}