aurora_db/search.rs
//! # Aurora Full-Text Search
//!
//! This module provides full-text search capabilities for Aurora documents.
//! It implements an inverted index with BM25-inspired ranking and fuzzy matching support.
//!
//! ## Features
//!
//! * **BM25 Ranking**: Documents are scored by relevance using a BM25-inspired algorithm
//! * **Fuzzy Matching**: Find words with typos or spelling variations
//! * **Custom Stopwords**: Add your own stopwords for filtering if needed
//! * **Unicode Support**: Properly handles different languages and character sets
//!
//! ## Example
//!
//! ```rust,ignore
//! // Basic search
//! let results = db.search("products")
//!     .field("description")
//!     .matching("wireless headphones")
//!     .collect()
//!     .await?;
//!
//! // With fuzzy matching for typo tolerance
//! let fuzzy_results = db.search("products")
//!     .field("description")
//!     .matching("wireles headphnes")
//!     .fuzzy(true)
//!     .collect()
//!     .await?;
//! ```
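//!
//! A `FullTextIndex` can also be driven directly. A minimal sketch using only
//! the API defined in this module, with stopword filtering enabled at
//! construction:
//!
//! ```rust,ignore
//! let mut index = FullTextIndex::with_options("articles", "content", true);
//! index.add_stopwords(&["the", "and", "of"]);
//! let matches = index.search("rust performance"); // Vec of (doc_id, score)
//! ```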

use crate::error::Result;
use crate::types::{Document, Value};
use dashmap::DashMap;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use unicode_segmentation::UnicodeSegmentation;

/// Full-text search index for a collection and field
///
/// This struct maintains an inverted index mapping terms to the documents
/// that contain them, with term frequency information for relevance scoring.
#[allow(dead_code)]
pub struct FullTextIndex {
    /// Name of the document field being indexed
    field: String,
    /// Inverted index: term -> [(doc_id, score)]
    index: Arc<DashMap<String, Vec<(String, f32)>>>,
    /// Number of documents currently indexed
    document_count: Arc<std::sync::atomic::AtomicUsize>,
    /// Words excluded from indexing and queries when filtering is enabled
    stop_words: HashSet<String>,
    /// Whether stopword filtering is applied during tokenization
    enable_stop_words: bool,
}

impl FullTextIndex {
    /// Create a new full-text index for a collection field
    ///
    /// # Arguments
    /// * `collection` - Name of the collection to index (currently unused)
    /// * `field` - Name of the field within documents to index
    ///
    /// # Examples
    /// ```ignore
    /// let description_index = FullTextIndex::new("products", "description");
    /// ```
    pub fn new(_collection: &str, field: &str) -> Self {
        Self {
            field: field.to_string(),
            index: Arc::new(DashMap::new()),
            document_count: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
            stop_words: HashSet::new(),
            enable_stop_words: false,
        }
    }

    /// Create a search index with specific options
    ///
    /// # Arguments
    /// * `collection` - Name of the collection to index
    /// * `field` - Name of the field within documents to index
    /// * `enable_stop_words` - Whether to filter out common words (false = no filtering)
    ///
    /// # Examples
    /// ```ignore
    /// // Create an index that doesn't filter out common words
    /// let index = FullTextIndex::with_options("products", "description", false);
    ///
    /// // Create an index that does filter out common words
    /// let index = FullTextIndex::with_options("articles", "content", true);
    /// ```
    #[allow(dead_code)]
    pub fn with_options(collection: &str, field: &str, enable_stop_words: bool) -> Self {
        let mut index = Self::new(collection, field);
        index.enable_stop_words = enable_stop_words;
        index
    }

    /// Set whether to filter out common stop words
    ///
    /// # Arguments
    /// * `enable` - true to filter out common words, false to include all words
    ///
    /// # Examples
    /// ```ignore
    /// // Enable filtering of common words like "the", "and", etc.
    /// index.set_stop_words_filter(true);
    ///
    /// // Disable filtering to allow searching for all words
    /// index.set_stop_words_filter(false);
    /// ```
    #[allow(dead_code)]
    pub fn set_stop_words_filter(&mut self, enable: bool) {
        self.enable_stop_words = enable;
    }

    /// Add a document to the index
    ///
    /// Indexing a document that is already present appends duplicate postings;
    /// call [`Self::remove_document`] first when re-indexing an updated document.
    ///
    /// # Arguments
    /// * `doc` - The document to index
    ///
    /// # Returns
    /// A Result indicating success or an error
    ///
    /// # Examples
    /// ```ignore
    /// index.index_document(&document)?;
    /// ```
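    ///
    /// # Scoring
    ///
    /// Each term is stored with a normalized term frequency of
    /// `count / total_terms`; in a four-word field, for example, a term that
    /// occurs once is stored with a score of `0.25`.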
    pub fn index_document(&self, doc: &Document) -> Result<()> {
        if let Some(Value::String(text)) = doc.data.get(&self.field) {
            let terms = self.tokenize(text);
            if !terms.is_empty() {
                self.document_count
                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            }

            let term_freq = self.calculate_term_frequencies(&terms);

            for (term, freq) in term_freq {
                // Normalized term frequency for this document's field
                let score = freq / terms.len() as f32;
                self.index
                    .entry(term)
                    .or_default()
                    .push((doc.id.clone(), score));
            }
        }
        Ok(())
    }

    /// Search the index for documents matching a query
    ///
    /// # Arguments
    /// * `query` - The search query text
    ///
    /// # Returns
    /// A vector of document IDs and relevance scores, sorted by descending score
    ///
    /// # Examples
    /// ```ignore
    /// let matches = index.search("wireless headphones");
    /// ```
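    ///
    /// # Scoring
    ///
    /// Each matching term contributes `tf_adjusted * idf` to a document's
    /// score, where, for `N` total documents and `df` documents containing the
    /// term, `idf = ln(1 + (N - df + 0.5) / (df + 0.5))` and
    /// `tf_adjusted = ((k1 + 1) * tf) / (k1 + tf)` with `k1 = 1.2`. This
    /// follows the shape of BM25 but omits its document-length normalization
    /// (the `b` parameter).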
    pub fn search(&self, query: &str) -> Vec<(String, f32)> {
        let query_terms = self.tokenize(query);
        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut scores: HashMap<String, f32> = HashMap::new();
        let total_docs = self
            .document_count
            .load(std::sync::atomic::Ordering::Relaxed)
            .max(1) as f32;

        for term in query_terms {
            let Some(term_entry) = self.index.get(&term) else { continue };

            // BM25-inspired scoring (simpler than full BM25)
            let doc_freq = term_entry.len() as f32;
            let idf = (1.0 + (total_docs - doc_freq + 0.5) / (doc_freq + 0.5)).ln();

            term_entry.iter().for_each(|(doc_id, tf)| {
                // Apply the BM25 `k1` term-frequency saturation; the length
                // normalization of full BM25 (the `b` parameter) is omitted.
                let k1 = 1.2;
                let tf_adjusted = ((k1 + 1.0) * tf) / (k1 + tf);

                *scores.entry(doc_id.clone()).or_insert(0.0) += tf_adjusted * idf;
            });
        }

        let mut results: Vec<_> = scores.into_iter().collect();
        // `total_cmp` gives a total order on f32, avoiding the panic that
        // `partial_cmp().unwrap()` would hit on NaN.
        results.sort_by(|(_, score1), (_, score2)| score2.total_cmp(score1));
        results
    }

    /// Search with fuzzy matching for typo tolerance
    ///
    /// Query terms of three characters or fewer are skipped, since they would
    /// otherwise fuzzy-match large parts of the index.
    ///
    /// # Arguments
    /// * `query` - The search query text
    /// * `max_distance` - Maximum edit distance for fuzzy matching
    ///
    /// # Returns
    /// A vector of document IDs and relevance scores, sorted by descending score
    ///
    /// # Examples
    /// ```ignore
    /// // Allow up to 2 character differences
    /// let matches = index.fuzzy_search("wireles headpones", 2);
    /// ```
    #[allow(dead_code)]
    pub fn fuzzy_search(&self, query: &str, max_distance: usize) -> Vec<(String, f32)> {
        let query_terms = self.tokenize(query);
        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut scores: HashMap<String, f32> = HashMap::new();

        // Snapshot all indexed terms so each query term can be compared against them
        let all_terms: Vec<String> = self.index.iter().map(|e| e.key().clone()).collect();

        for query_term in query_terms {
            // Skip very short terms, which would fuzzy-match almost anything
            if query_term.len() <= 3 {
                continue;
            }

            // Find fuzzy matches among the indexed terms
            for indexed_term in &all_terms {
                let distance = self.levenshtein_distance(&query_term, indexed_term);
                if distance <= max_distance {
                    if let Some(matches) = self.index.get(indexed_term) {
                        // Discount the score as the edit distance grows
                        let distance_factor = 1.0 / (1.0 + distance as f32);

                        for (doc_id, score) in matches.iter() {
                            *scores.entry(doc_id.clone()).or_insert(0.0) += score * distance_factor;
                        }
                    }
                }
            }
        }

        let mut results: Vec<_> = scores.into_iter().collect();
        results.sort_by(|(_, score1), (_, score2)| score2.total_cmp(score1));
        results
    }

    /// Break text into tokens (words) for indexing or searching
    ///
    /// # Arguments
    /// * `text` - The text to tokenize
    ///
    /// # Returns
    /// A vector of lowercased token strings
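    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Unicode-aware splitting via `unicode_words()`, then lowercasing;
    /// // punctuation and hyphens act as word boundaries.
    /// assert_eq!(
    ///     index.tokenize("The Quick-Brown FOX!"),
    ///     vec!["the", "quick", "brown", "fox"]
    /// );
    /// ```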
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens: Vec<String> = text
            .unicode_words()
            .map(|word| word.to_lowercase())
            .collect();

        // Filter out stop words if enabled
        if self.enable_stop_words {
            tokens.retain(|token| !self.stop_words.contains(token));
        }

        tokens
    }

    /// Calculate term frequencies for a set of terms
    ///
    /// # Returns
    /// A map of term to frequency
    fn calculate_term_frequencies(&self, terms: &[String]) -> HashMap<String, f32> {
        let mut freq = HashMap::new();
        for term in terms {
            *freq.entry(term.clone()).or_insert(0.0) += 1.0;
        }
        freq
    }

    /// Get the total number of documents in the index
    ///
    /// # Returns
    /// The number of documents indexed
    #[allow(dead_code)]
    fn get_total_docs(&self) -> usize {
        self.document_count.load(std::sync::atomic::Ordering::Relaxed)
    }

    /// Calculate Levenshtein distance between two strings
    ///
    /// Used for fuzzy matching to handle typos and spelling variations.
    /// Distances are measured in Unicode scalar values (`char`s), not bytes.
    ///
    /// # Arguments
    /// * `s1` - First string
    /// * `s2` - Second string
    ///
    /// # Returns
    /// The edit distance (number of changes needed to transform s1 into s2)
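    ///
    /// # Examples
    ///
    /// ```ignore
    /// // "kitten" -> "sitting": two substitutions and one insertion
    /// assert_eq!(index.levenshtein_distance("kitten", "sitting"), 3);
    /// ```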
    fn levenshtein_distance(&self, s1: &str, s2: &str) -> usize {
        // Compare Unicode scalar values rather than bytes, so multi-byte
        // characters count as single edits (and indexing is O(1) per step
        // instead of the quadratic `chars().nth()` lookup).
        let s1_chars: Vec<char> = s1.chars().collect();
        let s2_chars: Vec<char> = s2.chars().collect();
        let s1_len = s1_chars.len();
        let s2_len = s2_chars.len();

        // Early return for edge cases
        if s1_len == 0 {
            return s2_len;
        }
        if s2_len == 0 {
            return s1_len;
        }

        let mut prev_row = (0..=s2_len).collect::<Vec<_>>();
        let mut curr_row = vec![0; s2_len + 1];

        for i in 1..=s1_len {
            curr_row[0] = i;

            for j in 1..=s2_len {
                let cost = if s1_chars[i - 1] == s2_chars[j - 1] { 0 } else { 1 };
                curr_row[j] = std::cmp::min(
                    curr_row[j - 1] + 1, // insertion
                    std::cmp::min(
                        prev_row[j] + 1,      // deletion
                        prev_row[j - 1] + cost, // substitution
                    ),
                );
            }

            // Reuse the previous row's buffer for the next iteration
            std::mem::swap(&mut prev_row, &mut curr_row);
        }

        prev_row[s2_len]
    }

    /// Remove a document from the index
    ///
    /// # Arguments
    /// * `doc_id` - ID of the document to remove
    ///
    /// # Examples
    /// ```ignore
    /// index.remove_document("doc-123");
    /// ```
    #[allow(dead_code)]
    pub fn remove_document(&self, doc_id: &str) {
        let mut removed = false;

        // Remove the document from every term's posting list
        for mut entry in self.index.iter_mut() {
            let doc_entries = entry.value_mut();
            let original_len = doc_entries.len();
            doc_entries.retain(|(id, _)| id != doc_id);

            if doc_entries.len() < original_len {
                removed = true;
            }
        }

        // Decrement the document count once, no matter how many terms
        // referenced the document
        if removed {
            self.document_count
                .fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
        }
    }

    /// Add custom stopwords to the filter list
    ///
    /// # Arguments
    /// * `words` - List of words to add to the stopword list
    ///
    /// # Examples
    /// ```ignore
    /// // Add domain-specific stopwords
    /// index.add_stopwords(&["widget", "product", "item"]);
    /// ```
    #[allow(dead_code)]
    pub fn add_stopwords(&mut self, words: &[&str]) {
        for word in words {
            self.stop_words.insert(word.to_string());
        }
    }

    /// Remove all stopwords from the filter
    ///
    /// # Examples
    /// ```ignore
    /// // Allow searching for any word, including common ones
    /// index.clear_stopwords();
    /// ```
    #[allow(dead_code)]
    pub fn clear_stopwords(&mut self) {
        self.stop_words.clear();
    }

    /// Get the current list of stopwords
    ///
    /// # Returns
    /// A vector of stopwords currently in use
    ///
    /// # Examples
    /// ```ignore
    /// let current_stopwords = index.get_stopwords();
    /// println!("Currently filtering: {:?}", current_stopwords);
    /// ```
    #[allow(dead_code)]
    pub fn get_stopwords(&self) -> Vec<String> {
        self.stop_words.iter().cloned().collect()
    }

    /// Placeholder for fuzzy term lookup; not yet implemented (returns an
    /// empty Vec).
    #[allow(dead_code)]
    #[allow(unused_variables)]
    pub fn get_fuzzy_matches(&self, query: &str, max_distance: u32) -> Vec<(String, f32)> {
        // ...
        Vec::new()
    }

    /// Placeholder for match highlighting; not yet implemented (returns an
    /// empty String).
    #[allow(dead_code)]
    #[allow(unused_variables)]
    pub fn highlight_matches(&self, text: &str, query: &str) -> String {
        // ...
        String::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    #[test]
    fn test_full_text_index() -> Result<()> {
        let index = FullTextIndex::new("test_collection", "content");

        // Test document indexing
        let doc1 = Document {
            id: "1".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("The quick brown fox".to_string()));
                map
            },
        };

        let doc2 = Document {
            id: "2".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("The lazy brown dog".to_string()));
                map
            },
        };

        index.index_document(&doc1)?;
        index.index_document(&doc2)?;

        // Test search functionality
        let results = index.search("brown");
        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|(id, _)| id == "1"));
        assert!(results.iter().any(|(id, _)| id == "2"));

        let results = index.search("quick");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "1");

        Ok(())
    }

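    // Additional sketch tests for fuzzy matching, stopword filtering, and the
    // distance metric, using only the API defined in this module. The fuzzy
    // probe is longer than three characters because `fuzzy_search` skips
    // shorter query terms.
    #[test]
    fn test_fuzzy_search() -> Result<()> {
        let index = FullTextIndex::new("test_collection", "content");

        let doc = Document {
            id: "1".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("wireless headphones".to_string()));
                map
            },
        };
        index.index_document(&doc)?;

        // "wireles" is one edit away from the indexed term "wireless"
        let results = index.fuzzy_search("wireles", 2);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "1");

        Ok(())
    }

    #[test]
    fn test_stopword_filtering() {
        let mut index = FullTextIndex::with_options("test_collection", "content", true);
        index.add_stopwords(&["the", "a"]);

        let tokens = index.tokenize("the quick fox");
        assert_eq!(tokens, vec!["quick".to_string(), "fox".to_string()]);
    }

    #[test]
    fn test_levenshtein_distance() {
        let index = FullTextIndex::new("test_collection", "content");

        // Classic example: two substitutions plus one insertion
        assert_eq!(index.levenshtein_distance("kitten", "sitting"), 3);
        assert_eq!(index.levenshtein_distance("", "abc"), 3);
        assert_eq!(index.levenshtein_distance("same", "same"), 0);
    }
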
    #[test]
    fn test_tokenization() -> Result<()> {
        let index = FullTextIndex::new("test", "content");
        let tokens = index.tokenize("The Quick-Brown FOX!");

        assert_eq!(tokens, vec![
            "the".to_string(),
            "quick".to_string(),
            "brown".to_string(),
            "fox".to_string(),
        ]);

        Ok(())
    }
}