aprender/recommend/content_based.rs
1//! Content-based recommendation using TF-IDF + HNSW.
2//!
3//! This recommender finds similar items based on text content using:
4//! - TF-IDF vectorization for feature extraction
5//! - Incremental IDF for streaming updates
6//! - HNSW index for fast approximate nearest neighbor search
7//!
8//! # Algorithm
9//!
10//! 1. Text → TF-IDF vectors (with incremental IDF tracking)
11//! 2. Vectors → HNSW index (O(log n) insertion)
12//! 3. Query: HNSW search (O(log n) retrieval)
13//!
14//! # Complexity
15//!
//! - Add item: O(log n × d) amortized, where n=items, d=vocabulary size;
//!   an insert that grows the vocabulary triggers a full index rebuild, O(n × d × log n)
17//! - Recommend: O(log n × d)
18//! - Space: O(n × d)
19//!
20//! # Examples
21//!
22//! ```
23//! use aprender::recommend::ContentRecommender;
24//!
25//! let mut rec = ContentRecommender::new(16, 200, 0.95);
26//!
27//! // Add documents
28//! rec.add_item("ml_intro", "introduction to machine learning");
29//! rec.add_item("dl_basics", "deep learning fundamentals");
30//! rec.add_item("ml_practice", "practical machine learning guide");
31//!
32//! // Get similar items
33//! let similar = rec.recommend("ml_intro", 2).expect("recommend should succeed");
34//!
35//! assert_eq!(similar.len(), 2);
36//! // Should recommend ml_practice (shares "machine learning")
37//! ```
38
39use crate::error::AprenderError;
40use crate::index::hnsw::HNSWIndex;
41use crate::primitives::Vector;
42use crate::text::incremental_idf::IncrementalIDF;
43use crate::text::tokenize::WhitespaceTokenizer;
44use crate::text::Tokenizer;
45use std::collections::HashMap;
46
47/// Content-based recommender using TF-IDF + HNSW.
48///
49/// # Type Parameters
50///
51/// None - uses f64 internally for maximum precision
52///
53/// # Configuration
54///
55/// - `m`: HNSW connections per node (12-48 recommended)
56/// - `ef_construction`: HNSW construction parameter (100-200 recommended)
57/// - `decay_factor`: IDF decay factor (0.9-0.99 recommended)
/// Content-based recommender using TF-IDF + HNSW.
///
/// # Type Parameters
///
/// None - uses f64 internally for maximum precision
///
/// # Configuration
///
/// - `m`: HNSW connections per node (12-48 recommended)
/// - `ef_construction`: HNSW construction parameter (100-200 recommended)
/// - `decay_factor`: IDF decay factor (0.9-0.99 recommended)
#[derive(Debug)]
pub struct ContentRecommender {
    /// HNSW index for fast approximate nearest-neighbor search over item TF-IDF vectors.
    hnsw: HNSWIndex,
    /// Incremental IDF tracker; its term set defines the current vocabulary
    /// (and therefore the dimensionality of every vector in `hnsw`).
    idf: IncrementalIDF,
    /// Item ID -> raw text content. Source of truth for re-vectorizing all
    /// items when the index is rebuilt after vocabulary growth.
    item_content: HashMap<String, String>,
    /// Tokenizer used both when indexing items and when building query vectors.
    tokenizer: WhitespaceTokenizer,
}
69
70impl ContentRecommender {
71 /// Create a new content-based recommender.
72 ///
73 /// # Arguments
74 ///
75 /// * `m` - HNSW connections per node (12-48 recommended)
76 /// * `ef_construction` - HNSW construction parameter (100-200 recommended)
77 /// * `decay_factor` - IDF decay factor (0.9-0.99 recommended)
78 ///
79 /// # Examples
80 ///
81 /// ```
82 /// use aprender::recommend::ContentRecommender;
83 ///
84 /// let rec = ContentRecommender::new(16, 200, 0.95);
85 /// ```
86 #[must_use]
87 pub fn new(m: usize, ef_construction: usize, decay_factor: f64) -> Self {
88 Self {
89 hnsw: HNSWIndex::new(m, ef_construction, 0.0),
90 idf: IncrementalIDF::new(decay_factor),
91 item_content: HashMap::new(),
92 tokenizer: WhitespaceTokenizer::new(),
93 }
94 }
95
96 /// Add an item to the recommender.
97 ///
98 /// # Arguments
99 ///
100 /// * `item_id` - Unique identifier for the item
101 /// * `content` - Text content to extract features from
102 ///
103 /// # Examples
104 ///
105 /// ```
106 /// use aprender::recommend::ContentRecommender;
107 ///
108 /// let mut rec = ContentRecommender::new(16, 200, 0.95);
109 /// rec.add_item("item1", "machine learning tutorial");
110 /// rec.add_item("item2", "deep learning guide");
111 /// ```
112 pub fn add_item(&mut self, item_id: impl Into<String>, content: impl Into<String>) {
113 let item_id = item_id.into();
114 let content = content.into();
115
116 // Track vocabulary size before update
117 let vocab_size_before = self.idf.len();
118
119 // Tokenize
120 let tokens: Vec<String> = self.tokenizer.tokenize(&content).unwrap_or_default();
121
122 // Get unique terms for IDF update
123 let unique_terms: Vec<String> = tokens
124 .iter()
125 .map(|s| s.to_lowercase())
126 .collect::<std::collections::HashSet<_>>()
127 .into_iter()
128 .collect();
129
130 // Update IDF with unique terms
131 let term_refs: Vec<&str> = unique_terms.iter().map(String::as_str).collect();
132 self.idf.update(&term_refs);
133
134 // Store content first (needed for rebuild)
135 self.item_content.insert(item_id.clone(), content);
136
137 // Check if vocabulary grew
138 let vocab_size_after = self.idf.len();
139 if vocab_size_after > vocab_size_before && !self.item_content.is_empty() {
140 // Vocabulary grew - rebuild entire index to ensure dimensional consistency
141 self.rebuild_index();
142 } else {
143 // Vocabulary unchanged - just add this item
144 let tfidf_vec = self.compute_tfidf(&tokens);
145 self.hnsw.add(item_id, tfidf_vec);
146 }
147 }
148
149 /// Recommend similar items.
150 ///
151 /// # Arguments
152 ///
153 /// * `item_id` - Item to find similar items for
154 /// * `k` - Number of recommendations to return
155 ///
156 /// # Returns
157 ///
158 /// List of (`item_id`, `similarity_score`) pairs, sorted by similarity (highest first)
159 ///
160 /// # Examples
161 ///
162 /// ```
163 /// use aprender::recommend::ContentRecommender;
164 ///
165 /// let mut rec = ContentRecommender::new(16, 200, 0.95);
166 /// rec.add_item("a", "machine learning");
167 /// rec.add_item("b", "deep learning");
168 /// rec.add_item("c", "machine intelligence");
169 ///
170 /// let similar = rec.recommend("a", 2).expect("recommend should succeed");
171 /// assert_eq!(similar.len(), 2);
172 /// ```
173 pub fn recommend(&self, item_id: &str, k: usize) -> Result<Vec<(String, f64)>, AprenderError> {
174 // Get item content
175 let content = self
176 .item_content
177 .get(item_id)
178 .ok_or_else(|| AprenderError::Other(format!("Item not found: {item_id}")))?;
179
180 // Compute TF-IDF for query
181 let tokens = self.tokenizer.tokenize(content)?;
182 let query_vec = self.compute_tfidf(&tokens);
183
184 // Search HNSW (returns k+1 to exclude query item)
185 let results = self.hnsw.search(&query_vec, k + 1);
186
187 // Filter out query item and convert distance to similarity
188 let recommendations: Vec<(String, f64)> = results
189 .into_iter()
190 .filter(|(id, _)| id != item_id)
191 .take(k)
192 .map(|(id, dist)| {
193 // Convert cosine distance to cosine similarity
194 // distance = 1 - similarity, so similarity = 1 - distance
195 let similarity = 1.0 - dist;
196 (id, similarity)
197 })
198 .collect();
199
200 Ok(recommendations)
201 }
202
203 /// Number of items in the recommender.
204 ///
205 /// # Examples
206 ///
207 /// ```
208 /// use aprender::recommend::ContentRecommender;
209 ///
210 /// let mut rec = ContentRecommender::new(16, 200, 0.95);
211 /// assert_eq!(rec.len(), 0);
212 ///
213 /// rec.add_item("item1", "content");
214 /// assert_eq!(rec.len(), 1);
215 /// ```
216 #[must_use]
217 pub fn len(&self) -> usize {
218 self.item_content.len()
219 }
220
221 /// Check if recommender is empty.
222 ///
223 /// # Examples
224 ///
225 /// ```
226 /// use aprender::recommend::ContentRecommender;
227 ///
228 /// let rec = ContentRecommender::new(16, 200, 0.95);
229 /// assert!(rec.is_empty());
230 /// ```
231 #[must_use]
232 pub fn is_empty(&self) -> bool {
233 self.item_content.is_empty()
234 }
235
236 /// Rebuild the HNSW index with re-vectorized items.
237 ///
238 /// This is called when vocabulary grows to ensure all vectors have consistent dimensions.
239 /// All items are re-vectorized using the current (expanded) vocabulary.
240 fn rebuild_index(&mut self) {
241 // Create new HNSW index
242 self.hnsw = HNSWIndex::new(self.hnsw.m(), self.hnsw.ef_construction(), 0.0);
243
244 // Re-vectorize and re-add all items
245 for (item_id, content) in &self.item_content {
246 let tokens: Vec<String> = self.tokenizer.tokenize(content).unwrap_or_default();
247 let tfidf_vec = self.compute_tfidf(&tokens);
248 self.hnsw.add(item_id.clone(), tfidf_vec);
249 }
250 }
251
252 /// Compute TF-IDF vector for tokens.
253 fn compute_tfidf(&self, tokens: &[String]) -> Vector<f64> {
254 // Compute term frequencies
255 let mut tf: HashMap<String, f64> = HashMap::new();
256 for token in tokens {
257 let term = token.to_lowercase();
258 *tf.entry(term).or_insert(0.0) += 1.0;
259 }
260
261 // Normalize TF by max frequency
262 let max_tf = tf.values().copied().fold(0.0, f64::max);
263 if max_tf > 0.0 {
264 for value in tf.values_mut() {
265 *value /= max_tf;
266 }
267 }
268
269 // Get all vocabulary terms in sorted order for consistency
270 let mut vocab: Vec<String> = self.idf.terms().keys().cloned().collect();
271 vocab.sort();
272
273 // Compute TF-IDF vector (sparse representation as dense vector)
274 let tfidf: Vec<f64> = vocab
275 .iter()
276 .map(|term| {
277 let tf_val = tf.get(term).copied().unwrap_or(0.0);
278 let idf_val = self.idf.idf(term);
279 tf_val * idf_val
280 })
281 .collect();
282
283 Vector::from_vec(tfidf)
284 }
285}
286
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed recommender holds no items.
    #[test]
    fn test_empty_recommender() {
        let recommender = ContentRecommender::new(16, 200, 0.95);
        assert_eq!(recommender.len(), 0);
        assert!(recommender.is_empty());
    }

    /// Adding one item makes the recommender non-empty with length 1.
    #[test]
    fn test_add_single_item() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);
        recommender.add_item("item1", "machine learning");
        assert!(!recommender.is_empty());
        assert_eq!(recommender.len(), 1);
    }

    /// Items sharing more terms with the query rank higher.
    #[test]
    fn test_recommend_similar_items() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);
        recommender.add_item("ml_intro", "machine learning introduction");
        recommender.add_item("dl_guide", "deep learning neural networks");
        recommender.add_item("ml_practice", "machine learning applications");

        let results = recommender.recommend("ml_intro", 2).expect("should succeed");

        // "ml_practice" shares "machine learning" with the query, so it
        // should outrank "dl_guide".
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].0, "ml_practice");
    }

    /// Querying an unknown id yields an `AprenderError::Other`.
    #[test]
    fn test_recommend_nonexistent_item() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);
        recommender.add_item("item1", "content");

        let outcome = recommender.recommend("nonexistent", 1);
        assert!(outcome.is_err());
        if let Err(err) = outcome {
            assert!(matches!(err, AprenderError::Other(_)));
        }
    }

    /// Similarity scores remain finite while the vocabulary grows across adds,
    /// exercising the automatic index-rebuild path.
    #[test]
    fn test_similarity_scores() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);

        // Each add introduces new terms, so the recommender must rebuild the
        // index to keep vector dimensions consistent.
        recommender.add_item("a", "machine learning");
        recommender.add_item("b", "deep learning");
        recommender.add_item("c", "data science");

        let results = recommender.recommend("a", 2).expect("should succeed");
        assert_eq!(results.len(), 2);

        // No NaN/inf may leak out of the distance-to-similarity conversion.
        for (id, sim) in &results {
            assert!(
                sim.is_finite(),
                "Similarity for {id} should be finite, got {sim}"
            );
        }

        // "b" shares "learning" with "a", so it should rank first with a
        // strictly positive score.
        assert_eq!(results[0].0, "b");
        assert!(
            results[0].1 > 0.0,
            "Similarity should be positive: {}",
            results[0].1
        );
    }

    /// Empty content must not panic during indexing or recommendation.
    #[test]
    fn test_empty_content() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);
        recommender.add_item("empty", "");
        recommender.add_item("normal", "machine learning");

        assert!(recommender.recommend("normal", 1).is_ok());
    }

    /// Tokens are lowercased internally, so casing never affects similarity.
    #[test]
    fn test_case_insensitive() {
        let mut recommender = ContentRecommender::new(16, 200, 0.95);
        recommender.add_item("a", "Machine Learning");
        recommender.add_item("b", "machine learning");
        recommender.add_item("c", "MACHINE LEARNING");

        let results = recommender.recommend("a", 2).expect("should succeed");

        assert_eq!(results.len(), 2);
        for (_, sim) in results {
            assert!(
                sim > 0.9,
                "Similar terms should have high similarity despite case"
            );
        }
    }
}