// aprender/recommend/content_based.rs
//! Content-based recommendation using TF-IDF + HNSW.
//!
//! This recommender finds similar items based on text content using:
//! - TF-IDF vectorization for feature extraction
//! - Incremental IDF for streaming updates
//! - HNSW index for fast approximate nearest neighbor search
//!
//! # Algorithm
//!
//! 1. Text → TF-IDF vectors (with incremental IDF tracking)
//! 2. Vectors → HNSW index (O(log n) insertion)
//! 3. Query: HNSW search (O(log n) retrieval)
//!
//! # Complexity
//!
//! - Add item: O(log n × d) where n=items, d=vocabulary size
//! - Recommend: O(log n × d)
//! - Space: O(n × d)
//!
//! # Examples
//!
//! ```
//! use aprender::recommend::ContentRecommender;
//!
//! let mut rec = ContentRecommender::new(16, 200, 0.95);
//!
//! // Add documents
//! rec.add_item("ml_intro", "introduction to machine learning");
//! rec.add_item("dl_basics", "deep learning fundamentals");
//! rec.add_item("ml_practice", "practical machine learning guide");
//!
//! // Get similar items
//! let similar = rec.recommend("ml_intro", 2).expect("recommend should succeed");
//!
//! assert_eq!(similar.len(), 2);
//! // Should recommend ml_practice (shares "machine learning")
//! ```

use crate::error::AprenderError;
use crate::index::hnsw::HNSWIndex;
use crate::primitives::Vector;
use crate::text::incremental_idf::IncrementalIDF;
use crate::text::tokenize::WhitespaceTokenizer;
use crate::text::Tokenizer;
use std::collections::HashMap;

47/// Content-based recommender using TF-IDF + HNSW.
48///
49/// # Type Parameters
50///
51/// None - uses f64 internally for maximum precision
52///
53/// # Configuration
54///
55/// - `m`: HNSW connections per node (12-48 recommended)
56/// - `ef_construction`: HNSW construction parameter (100-200 recommended)
57/// - `decay_factor`: IDF decay factor (0.9-0.99 recommended)
58#[derive(Debug)]
59pub struct ContentRecommender {
60    /// HNSW index for fast similarity search
61    hnsw: HNSWIndex,
62    /// Incremental IDF tracker
63    idf: IncrementalIDF,
64    /// Item ID to text content mapping
65    item_content: HashMap<String, String>,
66    /// Tokenizer
67    tokenizer: WhitespaceTokenizer,
68}
69
70impl ContentRecommender {
71    /// Create a new content-based recommender.
72    ///
73    /// # Arguments
74    ///
75    /// * `m` - HNSW connections per node (12-48 recommended)
76    /// * `ef_construction` - HNSW construction parameter (100-200 recommended)
77    /// * `decay_factor` - IDF decay factor (0.9-0.99 recommended)
78    ///
79    /// # Examples
80    ///
81    /// ```
82    /// use aprender::recommend::ContentRecommender;
83    ///
84    /// let rec = ContentRecommender::new(16, 200, 0.95);
85    /// ```
86    #[must_use]
87    pub fn new(m: usize, ef_construction: usize, decay_factor: f64) -> Self {
88        Self {
89            hnsw: HNSWIndex::new(m, ef_construction, 0.0),
90            idf: IncrementalIDF::new(decay_factor),
91            item_content: HashMap::new(),
92            tokenizer: WhitespaceTokenizer::new(),
93        }
94    }
95
96    /// Add an item to the recommender.
97    ///
98    /// # Arguments
99    ///
100    /// * `item_id` - Unique identifier for the item
101    /// * `content` - Text content to extract features from
102    ///
103    /// # Examples
104    ///
105    /// ```
106    /// use aprender::recommend::ContentRecommender;
107    ///
108    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
109    /// rec.add_item("item1", "machine learning tutorial");
110    /// rec.add_item("item2", "deep learning guide");
111    /// ```
112    pub fn add_item(&mut self, item_id: impl Into<String>, content: impl Into<String>) {
113        let item_id = item_id.into();
114        let content = content.into();
115
116        // Track vocabulary size before update
117        let vocab_size_before = self.idf.len();
118
119        // Tokenize
120        let tokens: Vec<String> = self.tokenizer.tokenize(&content).unwrap_or_default();
121
122        // Get unique terms for IDF update
123        let unique_terms: Vec<String> = tokens
124            .iter()
125            .map(|s| s.to_lowercase())
126            .collect::<std::collections::HashSet<_>>()
127            .into_iter()
128            .collect();
129
130        // Update IDF with unique terms
131        let term_refs: Vec<&str> = unique_terms.iter().map(String::as_str).collect();
132        self.idf.update(&term_refs);
133
134        // Store content first (needed for rebuild)
135        self.item_content.insert(item_id.clone(), content);
136
137        // Check if vocabulary grew
138        let vocab_size_after = self.idf.len();
139        if vocab_size_after > vocab_size_before && !self.item_content.is_empty() {
140            // Vocabulary grew - rebuild entire index to ensure dimensional consistency
141            self.rebuild_index();
142        } else {
143            // Vocabulary unchanged - just add this item
144            let tfidf_vec = self.compute_tfidf(&tokens);
145            self.hnsw.add(item_id, tfidf_vec);
146        }
147    }
148
149    /// Recommend similar items.
150    ///
151    /// # Arguments
152    ///
153    /// * `item_id` - Item to find similar items for
154    /// * `k` - Number of recommendations to return
155    ///
156    /// # Returns
157    ///
158    /// List of (`item_id`, `similarity_score`) pairs, sorted by similarity (highest first)
159    ///
160    /// # Examples
161    ///
162    /// ```
163    /// use aprender::recommend::ContentRecommender;
164    ///
165    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
166    /// rec.add_item("a", "machine learning");
167    /// rec.add_item("b", "deep learning");
168    /// rec.add_item("c", "machine intelligence");
169    ///
170    /// let similar = rec.recommend("a", 2).expect("recommend should succeed");
171    /// assert_eq!(similar.len(), 2);
172    /// ```
173    pub fn recommend(&self, item_id: &str, k: usize) -> Result<Vec<(String, f64)>, AprenderError> {
174        // Get item content
175        let content = self
176            .item_content
177            .get(item_id)
178            .ok_or_else(|| AprenderError::Other(format!("Item not found: {item_id}")))?;
179
180        // Compute TF-IDF for query
181        let tokens = self.tokenizer.tokenize(content)?;
182        let query_vec = self.compute_tfidf(&tokens);
183
184        // Search HNSW (returns k+1 to exclude query item)
185        let results = self.hnsw.search(&query_vec, k + 1);
186
187        // Filter out query item and convert distance to similarity
188        let recommendations: Vec<(String, f64)> = results
189            .into_iter()
190            .filter(|(id, _)| id != item_id)
191            .take(k)
192            .map(|(id, dist)| {
193                // Convert cosine distance to cosine similarity
194                // distance = 1 - similarity, so similarity = 1 - distance
195                let similarity = 1.0 - dist;
196                (id, similarity)
197            })
198            .collect();
199
200        Ok(recommendations)
201    }
202
203    /// Number of items in the recommender.
204    ///
205    /// # Examples
206    ///
207    /// ```
208    /// use aprender::recommend::ContentRecommender;
209    ///
210    /// let mut rec = ContentRecommender::new(16, 200, 0.95);
211    /// assert_eq!(rec.len(), 0);
212    ///
213    /// rec.add_item("item1", "content");
214    /// assert_eq!(rec.len(), 1);
215    /// ```
216    #[must_use]
217    pub fn len(&self) -> usize {
218        self.item_content.len()
219    }
220
221    /// Check if recommender is empty.
222    ///
223    /// # Examples
224    ///
225    /// ```
226    /// use aprender::recommend::ContentRecommender;
227    ///
228    /// let rec = ContentRecommender::new(16, 200, 0.95);
229    /// assert!(rec.is_empty());
230    /// ```
231    #[must_use]
232    pub fn is_empty(&self) -> bool {
233        self.item_content.is_empty()
234    }
235
236    /// Rebuild the HNSW index with re-vectorized items.
237    ///
238    /// This is called when vocabulary grows to ensure all vectors have consistent dimensions.
239    /// All items are re-vectorized using the current (expanded) vocabulary.
240    fn rebuild_index(&mut self) {
241        // Create new HNSW index
242        self.hnsw = HNSWIndex::new(self.hnsw.m(), self.hnsw.ef_construction(), 0.0);
243
244        // Re-vectorize and re-add all items
245        for (item_id, content) in &self.item_content {
246            let tokens: Vec<String> = self.tokenizer.tokenize(content).unwrap_or_default();
247            let tfidf_vec = self.compute_tfidf(&tokens);
248            self.hnsw.add(item_id.clone(), tfidf_vec);
249        }
250    }
251
252    /// Compute TF-IDF vector for tokens.
253    fn compute_tfidf(&self, tokens: &[String]) -> Vector<f64> {
254        // Compute term frequencies
255        let mut tf: HashMap<String, f64> = HashMap::new();
256        for token in tokens {
257            let term = token.to_lowercase();
258            *tf.entry(term).or_insert(0.0) += 1.0;
259        }
260
261        // Normalize TF by max frequency
262        let max_tf = tf.values().copied().fold(0.0, f64::max);
263        if max_tf > 0.0 {
264            for value in tf.values_mut() {
265                *value /= max_tf;
266            }
267        }
268
269        // Get all vocabulary terms in sorted order for consistency
270        let mut vocab: Vec<String> = self.idf.terms().keys().cloned().collect();
271        vocab.sort();
272
273        // Compute TF-IDF vector (sparse representation as dense vector)
274        let tfidf: Vec<f64> = vocab
275            .iter()
276            .map(|term| {
277                let tf_val = tf.get(term).copied().unwrap_or(0.0);
278                let idf_val = self.idf.idf(term);
279                tf_val * idf_val
280            })
281            .collect();
282
283        Vector::from_vec(tfidf)
284    }
285}
286
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_recommender() {
        let rec = ContentRecommender::new(16, 200, 0.95);
        assert!(rec.is_empty());
        assert_eq!(rec.len(), 0);
    }

    #[test]
    fn test_add_single_item() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);
        rec.add_item("item1", "machine learning");
        assert_eq!(rec.len(), 1);
        assert!(!rec.is_empty());
    }

    #[test]
    fn test_recommend_similar_items() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("ml_intro", "machine learning introduction");
        rec.add_item("dl_guide", "deep learning neural networks");
        rec.add_item("ml_practice", "machine learning applications");

        let similar = rec.recommend("ml_intro", 2).expect("should succeed");

        assert_eq!(similar.len(), 2);
        // ml_practice should be more similar than dl_guide (shares "machine learning")
        assert_eq!(similar[0].0, "ml_practice");
    }

    #[test]
    fn test_recommend_nonexistent_item() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);
        rec.add_item("item1", "content");

        let result = rec.recommend("nonexistent", 1);
        assert!(result.is_err());
        if let Err(err) = result {
            assert!(matches!(err, AprenderError::Other(_)));
        }
    }

    #[test]
    fn test_similarity_scores() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        // Test that dimensional consistency is maintained when vocabulary grows.
        // The recommender should automatically rebuild the index when new terms appear.
        rec.add_item("a", "machine learning");
        rec.add_item("b", "deep learning");
        rec.add_item("c", "data science");

        let similar = rec.recommend("a", 2).expect("should succeed");

        // Should get 2 recommendations
        assert_eq!(similar.len(), 2);

        // All similarities should be finite (not NaN or inf)
        for (id, sim) in &similar {
            assert!(
                sim.is_finite(),
                "Similarity for {id} should be finite, got {sim}"
            );
        }

        // "b" should be most similar to "a" (both contain "learning")
        assert_eq!(similar[0].0, "b");
        assert!(
            similar[0].1 > 0.0,
            "Similarity should be positive: {}",
            similar[0].1
        );
    }

    #[test]
    fn test_empty_content() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("empty", "");
        rec.add_item("normal", "machine learning");

        // Should not panic on empty content
        let similar = rec.recommend("normal", 1);
        assert!(similar.is_ok());
    }

    #[test]
    fn test_case_insensitive() {
        let mut rec = ContentRecommender::new(16, 200, 0.95);

        rec.add_item("a", "Machine Learning");
        rec.add_item("b", "machine learning");
        rec.add_item("c", "MACHINE LEARNING");

        let similar = rec.recommend("a", 2).expect("should succeed");

        // All should be considered similar (case-insensitive)
        assert_eq!(similar.len(), 2);
        for (_, sim) in similar {
            assert!(
                sim > 0.9,
                "Similar terms should have high similarity despite case"
            );
        }
    }
}