aprender-core 0.29.1

Next-generation machine learning library in pure Rust
//! Content-based recommendation using TF-IDF + HNSW.
//!
//! This recommender finds similar items based on text content using:
//! - TF-IDF vectorization for feature extraction
//! - Incremental IDF for streaming updates
//! - HNSW index for fast approximate nearest neighbor search
//!
//! # Algorithm
//!
//! 1. Text → TF-IDF vectors (with incremental IDF tracking)
//! 2. Vectors → HNSW index (O(log n) insertion)
//! 3. Query: HNSW search (O(log n) retrieval)
//!
//! # Complexity
//!
//! - Add item: O(log n × d) where n=items, d=vocabulary size
//! - Recommend: O(log n × d)
//! - Space: O(n × d)
//!
//! # Examples
//!
//! ```
//! use aprender::recommend::ContentRecommender;
//!
//! let mut rec = ContentRecommender::new(16, 200, 0.95);
//!
//! // Add documents
//! rec.add_item("ml_intro", "introduction to machine learning");
//! rec.add_item("dl_basics", "deep learning fundamentals");
//! rec.add_item("ml_practice", "practical machine learning guide");
//!
//! // Get similar items
//! let similar = rec.recommend("ml_intro", 2).expect("recommend should succeed");
//!
//! assert_eq!(similar.len(), 2);
//! // Should recommend ml_practice (shares "machine learning")
//! ```

use crate::error::AprenderError;
use crate::index::hnsw::HNSWIndex;
use crate::primitives::Vector;
use crate::text::incremental_idf::IncrementalIDF;
use crate::text::tokenize::WhitespaceTokenizer;
use crate::text::Tokenizer;
use std::collections::HashMap;

/// Content-based recommender using TF-IDF + HNSW.
///
/// # Type Parameters
///
/// None - uses f64 internally for maximum precision
///
/// # Configuration
///
/// - `m`: HNSW connections per node (12-48 recommended)
/// - `ef_construction`: HNSW construction parameter (100-200 recommended)
/// - `decay_factor`: IDF decay factor (0.9-0.99 recommended)
#[derive(Debug)]
pub struct ContentRecommender {
    /// HNSW index for fast similarity search
    hnsw: HNSWIndex,
    /// Incremental IDF tracker
    idf: IncrementalIDF,
    /// Item ID to text content mapping
    item_content: HashMap<String, String>,
    /// Tokenizer
    tokenizer: WhitespaceTokenizer,
}

impl ContentRecommender {
    /// Create a new content-based recommender.
    ///
    /// # Arguments
    ///
    /// * `m` - HNSW connections per node (12-48 recommended)
    /// * `ef_construction` - HNSW construction parameter (100-200 recommended)
    /// * `decay_factor` - IDF decay factor (0.9-0.99 recommended)
    ///
    /// # Examples
    ///
    /// ```
    /// use aprender::recommend::ContentRecommender;
    ///
    /// let rec = ContentRecommender::new(16, 200, 0.95);
    /// ```
    #[must_use]
    pub fn new(m: usize, ef_construction: usize, decay_factor: f64) -> Self {
        Self {
            hnsw: HNSWIndex::new(m, ef_construction, 0.0),
            idf: IncrementalIDF::new(decay_factor),
            item_content: HashMap::new(),
            tokenizer: WhitespaceTokenizer::new(),
        }
    }

    /// Add an item to the recommender.
    ///
    /// # Arguments
    ///
    /// * `item_id` - Unique identifier for the item
    /// * `content` - Text content to extract features from
    ///
    /// # Examples
    ///
    /// ```
    /// use aprender::recommend::ContentRecommender;
    ///
    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
    /// rec.add_item("item1", "machine learning tutorial");
    /// rec.add_item("item2", "deep learning guide");
    /// ```
    pub fn add_item(&mut self, item_id: impl Into<String>, content: impl Into<String>) {
        let item_id = item_id.into();
        let content = content.into();

        // Track vocabulary size before update
        let vocab_size_before = self.idf.len();

        // Tokenize
        let tokens: Vec<String> = self.tokenizer.tokenize(&content).unwrap_or_default();

        // Get unique terms for IDF update
        let unique_terms: Vec<String> = tokens
            .iter()
            .map(|s| s.to_lowercase())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        // Update IDF with unique terms
        let term_refs: Vec<&str> = unique_terms.iter().map(String::as_str).collect();
        self.idf.update(&term_refs);

        // Store content first (needed for rebuild)
        self.item_content.insert(item_id.clone(), content);

        // Check if vocabulary grew
        let vocab_size_after = self.idf.len();
        if vocab_size_after > vocab_size_before && !self.item_content.is_empty() {
            // Vocabulary grew - rebuild entire index to ensure dimensional consistency
            self.rebuild_index();
        } else {
            // Vocabulary unchanged - just add this item
            let tfidf_vec = self.compute_tfidf(&tokens);
            self.hnsw.add(item_id, tfidf_vec);
        }
    }

    /// Recommend similar items.
    ///
    /// # Arguments
    ///
    /// * `item_id` - Item to find similar items for
    /// * `k` - Number of recommendations to return
    ///
    /// # Returns
    ///
    /// List of (`item_id`, `similarity_score`) pairs, sorted by similarity (highest first)
    ///
    /// # Examples
    ///
    /// ```
    /// use aprender::recommend::ContentRecommender;
    ///
    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
    /// rec.add_item("a", "machine learning");
    /// rec.add_item("b", "deep learning");
    /// rec.add_item("c", "machine intelligence");
    ///
    /// let similar = rec.recommend("a", 2).expect("recommend should succeed");
    /// assert_eq!(similar.len(), 2);
    /// ```
    pub fn recommend(&self, item_id: &str, k: usize) -> Result<Vec<(String, f64)>, AprenderError> {
        // Get item content
        let content = self
            .item_content
            .get(item_id)
            .ok_or_else(|| AprenderError::Other(format!("Item not found: {item_id}")))?;

        // Compute TF-IDF for query
        let tokens = self.tokenizer.tokenize(content)?;
        let query_vec = self.compute_tfidf(&tokens);

        // Search HNSW (returns k+1 to exclude query item)
        let results = self.hnsw.search(&query_vec, k + 1);

        // Filter out query item and convert distance to similarity
        let recommendations: Vec<(String, f64)> = results
            .into_iter()
            .filter(|(id, _)| id != item_id)
            .take(k)
            .map(|(id, dist)| {
                // Convert cosine distance to cosine similarity
                // distance = 1 - similarity, so similarity = 1 - distance
                let similarity = 1.0 - dist;
                (id, similarity)
            })
            .collect();

        Ok(recommendations)
    }

    /// Number of items in the recommender.
    ///
    /// # Examples
    ///
    /// ```
    /// use aprender::recommend::ContentRecommender;
    ///
    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
    /// assert_eq!(rec.len(), 0);
    ///
    /// rec.add_item("item1", "content");
    /// assert_eq!(rec.len(), 1);
    /// ```
    #[must_use]
    pub fn len(&self) -> usize {
        self.item_content.len()
    }

    /// Check if recommender is empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use aprender::recommend::ContentRecommender;
    ///
    /// let rec = ContentRecommender::new(16, 200, 0.95);
    /// assert!(rec.is_empty());
    /// ```
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.item_content.is_empty()
    }

    /// Rebuild the HNSW index with re-vectorized items.
    ///
    /// This is called when vocabulary grows to ensure all vectors have consistent dimensions.
    /// All items are re-vectorized using the current (expanded) vocabulary.
    fn rebuild_index(&mut self) {
        // Create new HNSW index
        self.hnsw = HNSWIndex::new(self.hnsw.m(), self.hnsw.ef_construction(), 0.0);

        // Re-vectorize and re-add all items
        for (item_id, content) in &self.item_content {
            let tokens: Vec<String> = self.tokenizer.tokenize(content).unwrap_or_default();
            let tfidf_vec = self.compute_tfidf(&tokens);
            self.hnsw.add(item_id.clone(), tfidf_vec);
        }
    }

    /// Compute TF-IDF vector for tokens.
    fn compute_tfidf(&self, tokens: &[String]) -> Vector<f64> {
        // Compute term frequencies
        let mut tf: HashMap<String, f64> = HashMap::new();
        for token in tokens {
            let term = token.to_lowercase();
            *tf.entry(term).or_insert(0.0) += 1.0;
        }

        // Normalize TF by max frequency
        let max_tf = tf.values().copied().fold(0.0, f64::max);
        if max_tf > 0.0 {
            for value in tf.values_mut() {
                *value /= max_tf;
            }
        }

        // Get all vocabulary terms in sorted order for consistency
        let mut vocab: Vec<String> = self.idf.terms().keys().cloned().collect();
        vocab.sort();

        // Compute TF-IDF vector (sparse representation as dense vector)
        let tfidf: Vec<f64> = vocab
            .iter()
            .map(|term| {
                let tf_val = tf.get(term).copied().unwrap_or(0.0);
                let idf_val = self.idf.idf(term);
                tf_val * idf_val
            })
            .collect();

        Vector::from_vec(tfidf)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_recommender() {
        let rec = ContentRecommender::new(16, 200, 0.95);
        assert!(rec.is_empty());
        assert_eq!(rec.len(), 0);
    }

    #[test]
    fn test_add_single_item() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);
        rec.add_item("item1", "machine learning");
        assert_eq!(rec.len(), 1);
        assert!(!rec.is_empty());
    }

    #[test]
    fn test_recommend_similar_items() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("ml_intro", "machine learning introduction");
        rec.add_item("dl_guide", "deep learning neural networks");
        rec.add_item("ml_practice", "machine learning applications");

        let similar = rec.recommend("ml_intro", 2).expect("should succeed");

        assert_eq!(similar.len(), 2);
        // ml_practice should be more similar than dl_guide (shares "machine learning")
        assert_eq!(similar[0].0, "ml_practice");
    }

    #[test]
    fn test_recommend_nonexistent_item() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);
        rec.add_item("item1", "content");

        let result = rec.recommend("nonexistent", 1);
        assert!(result.is_err());
        if let Err(err) = result {
            assert!(matches!(err, AprenderError::Other(_)));
        }
    }

    #[test]
    fn test_similarity_scores() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        // Test that dimensional consistency is maintained when vocabulary grows.
        // The recommender should automatically rebuild the index when new terms appear.
        rec.add_item("a", "machine learning");
        rec.add_item("b", "deep learning");
        rec.add_item("c", "data science");

        let similar = rec.recommend("a", 2).expect("should succeed");

        // Should get 2 recommendations
        assert_eq!(similar.len(), 2);

        // All similarities should be finite (not NaN or inf)
        for (id, sim) in &similar {
            assert!(
                sim.is_finite(),
                "Similarity for {id} should be finite, got {sim}"
            );
        }

        // "b" should be most similar to "a" (both contain "learning")
        assert_eq!(similar[0].0, "b");
        assert!(
            similar[0].1 > 0.0,
            "Similarity should be positive: {}",
            similar[0].1
        );
    }

    #[test]
    fn test_empty_content() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("empty", "");
        rec.add_item("normal", "machine learning");

        // Should not panic on empty content
        let similar = rec.recommend("normal", 1);
        assert!(similar.is_ok());
    }

    #[test]
    fn test_case_insensitive() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("a", "Machine Learning");
        rec.add_item("b", "machine learning");
        rec.add_item("c", "MACHINE LEARNING");

        let similar = rec.recommend("a", 2).expect("should succeed");

        // All should be considered similar (case-insensitive)
        assert_eq!(similar.len(), 2);
        for (_, sim) in similar {
            assert!(
                sim > 0.9,
                "Similar terms should have high similarity despite case"
            );
        }
    }
}