// vectorless 0.1.21

// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! BM25 scoring module using the `bm25` crate.
//!
//! This module provides:
//! - Per-field weighting for document scoring
//! - Configurable length normalization
//! - IDF caching for efficient scoring
//! - Query expansion support

use std::collections::HashMap;

use bm25::{
    Embedder, EmbedderBuilder, Embedding, Language, Scorer, ScoredDocument,
    DefaultTokenizer, Tokenizer,
};

/// Relative importance of each document field in weighted BM25 scoring.
///
/// Each value is the multiplier applied to that field's score before the
/// per-field scores are combined, so a larger weight makes matches in that
/// field count for more (a title hit usually matters more than a body hit).
#[derive(Debug, Clone, Copy)]
pub struct FieldWeights {
    /// Multiplier for the title field's score.
    pub title: f32,
    /// Multiplier for the summary field's score.
    pub summary: f32,
    /// Multiplier for the content field's score.
    pub content: f32,
}

impl Default for FieldWeights {
    /// Title-heavy defaults: title 2.0, summary 1.5, content 1.0.
    fn default() -> Self {
        let (title, summary, content) = (2.0, 1.5, 1.0);
        FieldWeights {
            title,
            summary,
            content,
        }
    }
}

/// Tunable BM25 parameters.
#[derive(Debug, Clone, Copy)]
pub struct Bm25Params {
    /// Term frequency saturation parameter (k1).
    ///
    /// Controls how quickly repeated occurrences of a term stop adding score.
    /// Typical value: 1.2
    pub k1: f32,
    /// Length normalization parameter (b).
    ///
    /// Controls how much document length affects scoring:
    /// - 0.0: no length normalization
    /// - 1.0: full length normalization
    ///
    /// Typical value: 0.75
    pub b: f32,
    /// Average document length.
    ///
    /// If it is not known it can be estimated, or set to 1.0 together with
    /// `b = 0`.
    pub avgdl: f32,
}

impl Default for Bm25Params {
    /// Defaults: `k1 = 1.2`, `b = 0.75`, `avgdl = 100.0`.
    fn default() -> Self {
        let (k1, b, avgdl) = (1.2, 0.75, 100.0);
        Bm25Params { k1, b, avgdl }
    }
}

/// A document split into the three fields that are scored separately.
#[derive(Debug, Clone)]
pub struct FieldDocument<K> {
    /// Identifier used to address the document in the index.
    pub id: K,
    /// Title text.
    pub title: String,
    /// Summary text.
    pub summary: String,
    /// Main body text.
    pub content: String,
}

impl<K> FieldDocument<K> {
    /// Build a document from its identifier and its three text fields.
    pub fn new(id: K, title: String, summary: String, content: String) -> Self {
        FieldDocument {
            id,
            title,
            summary,
            content,
        }
    }

    /// Title, summary and content joined by single spaces, in that order.
    ///
    /// Used wherever the document has to be embedded as one piece of text.
    fn combined_text(&self) -> String {
        let fields = [
            self.title.as_str(),
            self.summary.as_str(),
            self.content.as_str(),
        ];
        fields.join(" ")
    }
}

/// Key for field-specific document storage: a document id paired with one of
/// its fields.
///
/// NOTE(review): no use of this type is visible in this file — confirm whether
/// it is still needed.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
struct FieldKey<K> {
    /// Identifier of the document the entry belongs to.
    doc_id: K,
    /// Which field of that document the entry refers to.
    field: Field,
}

/// The document fields that can be addressed individually.
#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
enum Field {
    /// The title field.
    Title,
    /// The summary field.
    Summary,
    /// The content (body) field.
    Content,
}

/// BM25 engine with per-field weighting support.
///
/// This wraps the `bm25` crate's Embedder and Scorer to provide:
/// - Per-field weighting
/// - Configurable parameters
/// - IDF caching (handled internally by Scorer)
pub struct Bm25Engine<K> {
    /// The embedder for creating sparse vectors.
    embedder: Embedder,
    /// The scorer for scoring documents (combined text).
    scorer: Scorer<K>,
    /// Field-specific scorers for weighted scoring.
    title_scorer: Scorer<K>,
    summary_scorer: Scorer<K>,
    content_scorer: Scorer<K>,
    /// Field weights.
    weights: FieldWeights,
    /// Document count.
    doc_count: usize,
    /// Whether the engine has been fitted to a corpus.
    fitted: bool,
}

impl<K: std::hash::Hash + Eq + Clone + std::fmt::Debug> Bm25Engine<K> {
    /// Create a new BM25 engine with default parameters.
    pub fn new() -> Self {
        Self::with_params(Bm25Params::default())
    }

    /// Create a BM25 engine with custom parameters.
    pub fn with_params(params: Bm25Params) -> Self {
        let embedder = EmbedderBuilder::with_avgdl(params.avgdl)
            .k1(params.k1)
            .b(params.b)
            .language_mode(Language::English)
            .build();

        Self {
            embedder,
            scorer: Scorer::new(),
            title_scorer: Scorer::new(),
            summary_scorer: Scorer::new(),
            content_scorer: Scorer::new(),
            weights: FieldWeights::default(),
            doc_count: 0,
            fitted: false,
        }
    }

    /// Create a BM25 engine fitted to a corpus.
    ///
    /// This calculates the true average document length from the corpus.
    pub fn fit_to_corpus(documents: &[FieldDocument<K>]) -> Self {
        // Collect owned strings first
        let corpus: Vec<String> = documents.iter()
            .map(|d| d.combined_text())
            .collect();
        let corpus_refs: Vec<&str> = corpus.iter().map(|s| s.as_str()).collect();

        let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs)
            .build();

        let mut engine = Self {
            embedder,
            scorer: Scorer::new(),
            title_scorer: Scorer::new(),
            summary_scorer: Scorer::new(),
            content_scorer: Scorer::new(),
            weights: FieldWeights::default(),
            doc_count: 0,
            fitted: true,
        };

        // Index all documents
        for doc in documents {
            engine.upsert(doc);
        }

        engine
    }

    /// Set field weights.
    pub fn with_weights(mut self, weights: FieldWeights) -> Self {
        self.weights = weights;
        self
    }

    /// Set language for tokenization.
    pub fn with_language(mut self, language: Language) -> Self {
        self.embedder = EmbedderBuilder::with_avgdl(self.embedder.avgdl())
            .language_mode(language)
            .build();
        self
    }

    /// Get the average document length.
    pub fn avgdl(&self) -> f32 {
        self.embedder.avgdl()
    }

    /// Check if the engine has been fitted to a corpus.
    pub fn is_fitted(&self) -> bool {
        self.fitted
    }

    /// Upsert a document into the index.
    ///
    /// This stores embeddings for each field separately for weighted scoring.
    pub fn upsert(&mut self, document: &FieldDocument<K>) {
        let id = &document.id;

        // Embed and store each field separately
        let title_emb = self.embedder.embed(&document.title);
        let summary_emb = self.embedder.embed(&document.summary);
        let content_emb = self.embedder.embed(&document.content);

        self.title_scorer.upsert(id, title_emb);
        self.summary_scorer.upsert(id, summary_emb);
        self.content_scorer.upsert(id, content_emb);

        // Also store combined embedding for basic search
        let combined = self.embedder.embed(&document.combined_text());
        self.scorer.upsert(id, combined);

        self.doc_count += 1;
    }

    /// Remove a document from the index.
    pub fn remove(&mut self, id: &K) {
        self.scorer.remove(id);
        self.title_scorer.remove(id);
        self.summary_scorer.remove(id);
        self.content_scorer.remove(id);
        self.doc_count = self.doc_count.saturating_sub(1);
    }

    /// Get the number of indexed documents.
    pub fn len(&self) -> usize {
        self.doc_count
    }

    /// Check if the index is empty.
    pub fn is_empty(&self) -> bool {
        self.doc_count == 0
    }

    /// Score a single document against a query.
    ///
    /// Returns None if the document is not in the index.
    pub fn score(&self, id: &K, query: &str) -> Option<f32> {
        let query_emb = self.embedder.embed(query);

        // Score each field
        let title_score = self.title_scorer.score(id, &query_emb)?;
        let summary_score = self.summary_scorer.score(id, &query_emb)?;
        let content_score = self.content_scorer.score(id, &query_emb)?;

        // Weighted combination
        let total_weight = self.weights.title + self.weights.summary + self.weights.content;
        let weighted_score = (title_score * self.weights.title
            + summary_score * self.weights.summary
            + content_score * self.weights.content) / total_weight;

        Some(weighted_score)
    }

    /// Search for documents matching a query.
    ///
    /// Returns documents sorted by score (descending).
    pub fn search(&self, query: &str, limit: usize) -> Vec<ScoredDocument<K>> {
        let query_emb = self.embedder.embed(query);
        self.scorer.matches(&query_emb).into_iter().take(limit).collect()
    }

    /// Search with per-field weighting.
    ///
    /// This is slower but provides more accurate weighted scores.
    pub fn search_weighted(&self, query: &str, limit: usize) -> Vec<(K, f32)> {
        let query_emb = self.embedder.embed(query);

        // Get all document IDs from the main scorer
        let all_results = self.scorer.matches(&query_emb);

        let mut scored: Vec<(K, f32)> = all_results
            .into_iter()
            .filter_map(|scored_doc| {
                let id = scored_doc.id;

                // Get per-field scores
                let title_score = self.title_scorer.score(&id, &query_emb)?;
                let summary_score = self.summary_scorer.score(&id, &query_emb)?;
                let content_score = self.content_scorer.score(&id, &query_emb)?;

                let total_weight = self.weights.title + self.weights.summary + self.weights.content;
                let weighted_score = (title_score * self.weights.title
                    + summary_score * self.weights.summary
                    + content_score * self.weights.content) / total_weight;

                Some((id, weighted_score))
            })
            .collect();

        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        scored.truncate(limit);
        scored
    }

    /// Extract keywords from a query (tokenize and filter).
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        let tokenizer = DefaultTokenizer::builder()
            .language_mode(Language::English)
            .normalization(true)
            .stopwords(true)
            .stemming(true)
            .build();
        tokenizer.tokenize(text)
    }

    /// Get the underlying embedder.
    pub fn embedder(&self) -> &Embedder {
        &self.embedder
    }

    /// Get mutable access to the embedder.
    pub fn embedder_mut(&mut self) -> &mut Embedder {
        &mut self.embedder
    }
}

impl<K: std::hash::Hash + Eq + Clone + std::fmt::Debug> Default for Bm25Engine<K> {
    fn default() -> Self {
        Self::new()
    }
}

/// Query expansion result from LLM.
#[derive(Debug, Clone)]
pub struct ExpandedQuery {
    /// Original query.
    pub original: String,
    /// Expanded terms.
    pub expansions: Vec<String>,
    /// Combined query (original + expansions), separated by single spaces.
    pub combined: String,
}

impl ExpandedQuery {
    /// Create a new expanded query.
    ///
    /// `combined` is the original query followed by every expansion, joined
    /// with single spaces. Empty parts are skipped, so an empty `expansions`
    /// list (or an empty `original`) leaves no stray leading, trailing or
    /// doubled space.
    pub fn new(original: String, expansions: Vec<String>) -> Self {
        let combined = std::iter::once(original.as_str())
            .chain(expansions.iter().map(String::as_str))
            .filter(|part| !part.is_empty())
            .collect::<Vec<_>>()
            .join(" ");

        Self {
            original,
            expansions,
            combined,
        }
    }
}

/// Query expander trait for LLM-based expansion.
///
/// Implementors must be `Send + Sync`. The trait is declared through
/// `async_trait`, so `expand` is written as an `async fn`.
#[async_trait::async_trait]
pub trait QueryExpander: Send + Sync {
    /// Expand a query with related terms.
    ///
    /// Returns an [`ExpandedQuery`]. The signature has no error channel, so an
    /// implementation has to decide for itself what to return when expansion
    /// fails.
    async fn expand(&self, query: &str) -> ExpandedQuery;
}

/// Common English stop words for keyword filtering.
///
/// All entries are lowercase; callers must lowercase text before comparing.
pub const STOPWORDS: &[&str] = &[
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "dare",
    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
    "from", "as", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "just", "and", "but",
    "if", "or", "because", "until", "while", "about", "what", "which",
    "who", "whom", "this", "that", "these", "those", "i", "me", "my",
    "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
    "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves",
];

/// Extract keywords from a query string, filtering stop words.
///
/// This is a simple keyword extraction that:
/// - Converts to lowercase
/// - Splits on non-alphanumeric characters
/// - Filters out stop words (see [`STOPWORDS`])
/// - Requires minimum length of 2 characters
///
/// Keywords are returned in the order they appear; duplicates are kept.
#[must_use]
pub fn extract_keywords(query: &str) -> Vec<String> {
    query
        .to_lowercase()
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| {
            // Count characters, not bytes: a lone non-ASCII letter is several
            // bytes long in UTF-8 but is still a single character. This also
            // rejects the empty pieces produced by adjacent separators.
            let has_two_chars = word.chars().nth(1).is_some();
            has_two_chars && !STOPWORDS.contains(word)
        })
        .map(String::from)
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `u32`-keyed document from string slices.
    fn doc(id: u32, title: &str, summary: &str, content: &str) -> FieldDocument<u32> {
        FieldDocument::new(id, title.to_string(), summary.to_string(), content.to_string())
    }

    /// A freshly created engine holds nothing and is not fitted.
    #[test]
    fn test_bm25_engine_creation() {
        let engine: Bm25Engine<u32> = Bm25Engine::new();
        assert!(engine.is_empty());
        assert!(!engine.is_fitted());
    }

    /// Fitting indexes every document and marks the engine as fitted.
    #[test]
    fn test_bm25_engine_fit_to_corpus() {
        let corpus = vec![
            doc(1, "Rust Programming", "About Rust", "Rust is a systems programming language."),
            doc(2, "Python Guide", "About Python", "Python is a scripting language."),
        ];

        let engine = Bm25Engine::fit_to_corpus(&corpus);
        assert!(engine.is_fitted());
        assert_eq!(engine.len(), 2);
    }

    /// Plain search returns at least one of the Rust documents.
    #[test]
    fn test_bm25_search() {
        let corpus = vec![
            doc(
                1,
                "Rust Programming",
                "About Rust",
                "Rust is a systems programming language with memory safety.",
            ),
            doc(
                2,
                "Python Guide",
                "About Python",
                "Python is a scripting language for data science.",
            ),
            doc(
                3,
                "Rust Memory Safety",
                "Memory in Rust",
                "Rust provides guaranteed memory safety without garbage collection.",
            ),
        ];

        let engine = Bm25Engine::fit_to_corpus(&corpus);
        let hits = engine.search("rust memory", 10);

        assert!(!hits.is_empty());
        // At least one of the documents about Rust must be among the hits
        assert!(hits.iter().any(|hit| hit.id == 1 || hit.id == 3));
    }

    /// With title-heavy weights, a title match outranks a content match.
    #[test]
    fn test_bm25_weighted_search() {
        let corpus = vec![
            doc(1, "Rust Programming", "About memory safety", "Content about other things."),
            doc(2, "Other Language", "About other things", "Rust memory safety is important."),
        ];
        let weights = FieldWeights {
            title: 3.0,
            summary: 2.0,
            content: 1.0,
        };

        let engine = Bm25Engine::fit_to_corpus(&corpus).with_weights(weights);
        let ranked = engine.search_weighted("rust", 10);

        // Doc 1 has "Rust" in title, should rank higher
        let top_id = ranked.first().map(|(id, _)| *id);
        assert_eq!(top_id, Some(1u32));
    }

    /// Scoring an indexed document against matching terms is positive.
    #[test]
    fn test_bm25_score() {
        let corpus = vec![doc(
            1,
            "Rust Programming",
            "About Rust",
            "Rust is a systems programming language.",
        )];

        let engine = Bm25Engine::fit_to_corpus(&corpus);
        let score = engine.score(&1u32, "rust programming");

        assert!(score.is_some());
        assert!(score.unwrap() > 0.0);
    }

    /// Tokenization drops stop words and stems the remaining terms.
    #[test]
    fn test_bm25_tokenize() {
        let engine: Bm25Engine<u32> = Bm25Engine::new();
        let tokens = engine.tokenize("What is the Rust programming language?");
        let has = |term: &str| tokens.contains(&term.to_string());

        // Should filter stop words and stem
        assert!(has("rust"));
        assert!(has("program")); // stemmed
        assert!(!has("what")); // stop word
        assert!(!has("the")); // stop word
    }

    /// Removing the only document leaves the index empty.
    #[test]
    fn test_bm25_remove() {
        let corpus = vec![doc(1, "Rust", "About Rust", "Rust content.")];

        let mut engine = Bm25Engine::fit_to_corpus(&corpus);
        assert_eq!(engine.len(), 1);

        engine.remove(&1u32);
        assert!(engine.is_empty());
    }

    /// Default field weights are 2.0 / 1.5 / 1.0.
    #[test]
    fn test_field_weights_default() {
        let FieldWeights {
            title,
            summary,
            content,
        } = FieldWeights::default();
        assert!((title - 2.0).abs() < f32::EPSILON);
        assert!((summary - 1.5).abs() < f32::EPSILON);
        assert!((content - 1.0).abs() < f32::EPSILON);
    }

    /// Default BM25 parameters are k1 = 1.2, b = 0.75, avgdl = 100.0.
    #[test]
    fn test_bm25_params_default() {
        let Bm25Params { k1, b, avgdl } = Bm25Params::default();
        assert!((k1 - 1.2).abs() < f32::EPSILON);
        assert!((b - 0.75).abs() < f32::EPSILON);
        assert!((avgdl - 100.0).abs() < f32::EPSILON);
    }

    /// The combined query is the original followed by its expansions.
    #[test]
    fn test_expanded_query() {
        let expansions = vec!["programming".to_string(), "language".to_string()];
        let expanded = ExpandedQuery::new("rust".to_string(), expansions);

        assert_eq!(expanded.original, "rust");
        assert_eq!(expanded.expansions.len(), 2);
        assert_eq!(expanded.combined, "rust programming language");
    }
}