aurora_db/search.rs

//! # Aurora Full-Text Search
//!
//! This module provides full-text search capabilities for Aurora documents.
//! It implements an inverted index with BM25-inspired ranking and fuzzy matching support.
//!
//! ## Features
//!
//! * **BM25 Ranking**: Documents are scored by relevance using a BM25-inspired algorithm
//! * **Fuzzy Matching**: Find words with typos or spelling variations
//! * **Custom Stopwords**: Add your own stopwords for filtering if needed
//! * **Unicode Support**: Properly handles different languages and character sets
//!
//! ## Example
//!
//! ```rust,ignore
//! // Basic search
//! let results = db.search("products")
//!     .field("description")
//!     .matching("wireless headphones")
//!     .collect()
//!     .await?;
//!
//! // With fuzzy matching for typo tolerance
//! let fuzzy_results = db.search("products")
//!     .field("description")
//!     .matching("wireles headphnes")
//!     .fuzzy(true)
//!     .collect()
//!     .await?;
//! ```

use crate::error::Result;
use crate::types::{Document, Value};
use dashmap::DashMap;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use unicode_segmentation::UnicodeSegmentation;

/// Full-text search index for a collection field
///
/// This struct maintains an inverted index mapping terms to the documents
/// that contain them, with term frequency information for relevance scoring.
#[allow(dead_code)]
pub struct FullTextIndex {
    /// Name of the document field being indexed
    field: String,
    /// Inverted index: term -> [(doc_id, score)]
    index: Arc<DashMap<String, Vec<(String, f32)>>>,
    /// Total number of indexed documents, used for IDF weighting
    document_count: Arc<std::sync::atomic::AtomicUsize>,
    /// Words skipped during tokenization when filtering is enabled
    stop_words: HashSet<String>,
    /// Whether stop-word filtering is active
    enable_stop_words: bool,
}
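
// A minimal sketch of the inverted index contents after indexing two
// hypothetical documents ("a": "quick brown fox", "b": "lazy brown dog");
// each stored score is the term's frequency divided by the document's
// length in terms (see `index_document`):
//
//     "quick" -> [("a", 1.0 / 3.0)]
//     "brown" -> [("a", 1.0 / 3.0), ("b", 1.0 / 3.0)]
//     "fox"   -> [("a", 1.0 / 3.0)]
//     "lazy"  -> [("b", 1.0 / 3.0)]
//     "dog"   -> [("b", 1.0 / 3.0)]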

impl FullTextIndex {
    /// Create a new full-text index for a collection field
    ///
    /// # Arguments
    /// * `collection` - Name of the collection to index
    /// * `field` - Name of the field within documents to index
    ///
    /// # Examples
    /// ```ignore
    /// let description_index = FullTextIndex::new("products", "description");
    /// ```
    pub fn new(_collection: &str, field: &str) -> Self {
        Self {
            field: field.to_string(),
            index: Arc::new(DashMap::new()),
            document_count: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
            stop_words: HashSet::new(),
            enable_stop_words: false,
        }
    }

    /// Create a search index with specific options
    ///
    /// # Arguments
    /// * `collection` - Name of the collection to index
    /// * `field` - Name of the field within documents to index
    /// * `enable_stop_words` - Whether to filter out common words (false = no filtering)
    ///
    /// # Examples
    /// ```ignore
    /// // Create an index that doesn't filter out common words
    /// let index = FullTextIndex::with_options("products", "description", false);
    ///
    /// // Create an index that does filter out common words
    /// let index = FullTextIndex::with_options("articles", "content", true);
    /// ```
    #[allow(dead_code)]
    pub fn with_options(collection: &str, field: &str, enable_stop_words: bool) -> Self {
        let mut index = Self::new(collection, field);
        index.enable_stop_words = enable_stop_words;
        index
    }

    /// Set whether to filter out common stop words
    ///
    /// # Arguments
    /// * `enable` - true to filter out common words, false to include all words
    ///
    /// # Examples
    /// ```ignore
    /// // Enable filtering of common words like "the", "and", etc.
    /// index.set_stop_words_filter(true);
    ///
    /// // Disable filtering to allow searching for all words
    /// index.set_stop_words_filter(false);
    /// ```
    #[allow(dead_code)]
    pub fn set_stop_words_filter(&mut self, enable: bool) {
        self.enable_stop_words = enable;
    }

    /// Add a document to the index
    ///
    /// Note that this method does not replace existing entries: to re-index
    /// a changed document, call `remove_document` first.
    ///
    /// # Arguments
    /// * `doc` - The document to index
    ///
    /// # Returns
    /// A Result indicating success or an error
    ///
    /// # Examples
    /// ```ignore
    /// index.index_document(&document)?;
    /// ```
    pub fn index_document(&self, doc: &Document) -> Result<()> {
        if let Some(Value::String(text)) = doc.data.get(&self.field) {
            let terms = self.tokenize(text);
            if !terms.is_empty() {
                self.document_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            }

            let term_freq = self.calculate_term_frequencies(&terms);

            for (term, freq) in term_freq {
                let score = freq / terms.len() as f32; // length-normalized TF score
                self.index
                    .entry(term)
                    .or_default()
                    .push((doc.id.clone(), score));
            }
        }
        Ok(())
    }

    /// Search the index for documents matching a query
    ///
    /// # Arguments
    /// * `query` - The search query text
    ///
    /// # Returns
    /// A vector of document IDs and relevance scores, sorted by descending score
    ///
    /// # Examples
    /// ```ignore
    /// let matches = index.search("wireless headphones");
    /// ```
    pub fn search(&self, query: &str) -> Vec<(String, f32)> {
        let query_terms = self.tokenize(query);
        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut scores: HashMap<String, f32> = HashMap::new();
        let total_docs = self.document_count.load(std::sync::atomic::Ordering::Relaxed).max(1) as f32;

        for term in query_terms {
            let Some(term_entry) = self.index.get(&term) else { continue };

            // BM25-inspired scoring (simpler than full BM25)
            let doc_freq = term_entry.len() as f32;
            let idf = (1.0 + (total_docs - doc_freq + 0.5) / (doc_freq + 0.5)).ln();

            term_entry.iter().for_each(|(doc_id, tf)| {
                // Apply the BM25 k1 saturation parameter (the document-length
                // parameter b is omitted in this simplified version)
                let k1 = 1.2;
                let tf_adjusted = ((k1 + 1.0) * tf) / (k1 + tf);

                *scores.entry(doc_id.clone()).or_insert(0.0) += tf_adjusted * idf;
            });
        }

        let mut results: Vec<_> = scores.into_iter().collect();
        // total_cmp avoids the panic that partial_cmp().unwrap() would hit on NaN
        results.sort_by(|(_, score1), (_, score2)| score2.total_cmp(score1));
        results
    }

    /// Search with fuzzy matching for typo tolerance
    ///
    /// # Arguments
    /// * `query` - The search query text
    /// * `max_distance` - Maximum edit distance for fuzzy matching
    ///
    /// # Returns
    /// A vector of document IDs and relevance scores
    ///
    /// # Examples
    /// ```ignore
    /// // Allow up to 2 character differences
    /// let matches = index.fuzzy_search("wireles headpones", 2);
    /// ```
    #[allow(dead_code)]
    pub fn fuzzy_search(&self, query: &str, max_distance: usize) -> Vec<(String, f32)> {
        let query_terms = self.tokenize(query);
        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut scores: HashMap<String, f32> = HashMap::new();

        // Get all indexed terms
        let all_terms: Vec<String> = self.index.iter().map(|e| e.key().clone()).collect();

        for query_term in query_terms {
            // Find fuzzy matches; compute the edit distance once per pair.
            // Only terms longer than three characters are fuzzy-matched (to
            // avoid false positives like "cat" ~ "car"), but exact matches
            // are always accepted.
            for indexed_term in &all_terms {
                let distance = self.levenshtein_distance(&query_term, indexed_term);
                let is_match = distance == 0
                    || (query_term.chars().count() > 3 && distance <= max_distance);
                if !is_match {
                    continue;
                }

                if let Some(matches) = self.index.get(indexed_term) {
                    // Discount the score as the edit distance grows
                    let distance_factor = 1.0 / (1.0 + distance as f32);

                    for (doc_id, score) in matches.iter() {
                        *scores.entry(doc_id.clone()).or_insert(0.0) += score * distance_factor;
                    }
                }
            }
        }

        let mut results: Vec<_> = scores.into_iter().collect();
        results.sort_by(|(_, score1), (_, score2)| score2.total_cmp(score1));
        results
    }

    /// Break text into tokens (words) for indexing or searching
    ///
    /// # Arguments
    /// * `text` - The text to tokenize
    ///
    /// # Returns
    /// A vector of token strings
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens: Vec<String> = text.unicode_words()
            .map(|word| word.to_lowercase())
            .collect();

        // Filter out stop words if enabled
        if self.enable_stop_words {
            tokens.retain(|token| !self.stop_words.contains(token));
        }

        tokens
    }

    /// Calculate term frequencies for a set of terms
    ///
    /// # Returns
    /// A map of term to frequency
    fn calculate_term_frequencies(&self, terms: &[String]) -> HashMap<String, f32> {
        let mut freq = HashMap::new();
        for term in terms {
            *freq.entry(term.clone()).or_insert(0.0) += 1.0;
        }
        freq
    }

    /// Get the total number of documents in the index
    ///
    /// # Returns
    /// The number of documents indexed
    #[allow(dead_code)]
    fn get_total_docs(&self) -> usize {
        self.document_count.load(std::sync::atomic::Ordering::Relaxed)
    }

    /// Calculate Levenshtein distance between two strings
    ///
    /// Used for fuzzy matching to handle typos and spelling variations
    ///
    /// # Arguments
    /// * `s1` - First string
    /// * `s2` - Second string
    ///
    /// # Returns
    /// The edit distance (number of changes needed to transform s1 into s2)
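    ///
    /// # Examples
    /// ```ignore
    /// // "kitten" -> "sitting": two substitutions plus one insertion
    /// assert_eq!(index.levenshtein_distance("kitten", "sitting"), 3);
    /// ```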
    fn levenshtein_distance(&self, s1: &str, s2: &str) -> usize {
        // Work on char vectors: byte lengths and repeated `chars().nth()`
        // calls give wrong answers for multi-byte (non-ASCII) input and
        // make the inner loop quadratic.
        let s1_chars: Vec<char> = s1.chars().collect();
        let s2_chars: Vec<char> = s2.chars().collect();
        let s1_len = s1_chars.len();
        let s2_len = s2_chars.len();

        // Early return for edge cases
        if s1_len == 0 { return s2_len; }
        if s2_len == 0 { return s1_len; }

        let mut prev_row = (0..=s2_len).collect::<Vec<_>>();
        let mut curr_row = vec![0; s2_len + 1];

        for i in 1..=s1_len {
            curr_row[0] = i;

            for j in 1..=s2_len {
                let cost = if s1_chars[i - 1] == s2_chars[j - 1] { 0 } else { 1 };
                curr_row[j] = std::cmp::min(
                    curr_row[j - 1] + 1,  // insertion
                    std::cmp::min(
                        prev_row[j] + 1,  // deletion
                        prev_row[j - 1] + cost,  // substitution
                    ),
                );
            }

            // Swap rows
            std::mem::swap(&mut prev_row, &mut curr_row);
        }

        prev_row[s2_len]
    }

    /// Remove a document from the index
    ///
    /// # Arguments
    /// * `doc_id` - ID of the document to remove
    ///
    /// # Examples
    /// ```ignore
    /// index.remove_document("doc-123");
    /// ```
    #[allow(dead_code)]
    pub fn remove_document(&self, doc_id: &str) {
        // Remove the document from every term entry (an early break here
        // would leave stale entries under the remaining terms)
        let mut found = false;
        for mut entry in self.index.iter_mut() {
            let doc_entries = entry.value_mut();
            let original_len = doc_entries.len();
            doc_entries.retain(|(id, _)| id != doc_id);

            if doc_entries.len() < original_len {
                found = true;
            }
        }

        // Decrement the document count once, no matter how many terms
        // referenced the document
        if found {
            self.document_count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
        }
    }

    /// Add custom stopwords to the filter list
    ///
    /// # Arguments
    /// * `words` - List of words to add to the stopword list
    ///
    /// # Examples
    /// ```ignore
    /// // Add domain-specific stopwords
    /// index.add_stopwords(&["widget", "product", "item"]);
    /// ```
    #[allow(dead_code)]
    pub fn add_stopwords(&mut self, words: &[&str]) {
        for word in words {
            self.stop_words.insert(word.to_string());
        }
    }

    /// Remove all stopwords from the filter
    ///
    /// # Examples
    /// ```ignore
    /// // Allow searching for any word, including common ones
    /// index.clear_stopwords();
    /// ```
    #[allow(dead_code)]
    pub fn clear_stopwords(&mut self) {
        self.stop_words.clear();
    }

    /// Get the current list of stopwords
    ///
    /// # Returns
    /// A vector of stopwords currently in use
    ///
    /// # Examples
    /// ```ignore
    /// let current_stopwords = index.get_stopwords();
    /// println!("Currently filtering: {:?}", current_stopwords);
    /// ```
    #[allow(dead_code)]
    pub fn get_stopwords(&self) -> Vec<String> {
        self.stop_words.iter().cloned().collect()
    }

    /// Find documents whose indexed terms are within `max_distance` edits
    /// of the query terms (a thin wrapper around `fuzzy_search`)
    #[allow(dead_code)]
    pub fn get_fuzzy_matches(&self, query: &str, max_distance: u32) -> Vec<(String, f32)> {
        self.fuzzy_search(query, max_distance as usize)
    }

    /// Wrap query terms found in `text` with `**` markers (markdown-style
    /// bold), preserving the original casing, punctuation, and whitespace
    #[allow(dead_code)]
    pub fn highlight_matches(&self, text: &str, query: &str) -> String {
        let query_terms: HashSet<String> = self.tokenize(query).into_iter().collect();
        text.split_word_bounds()
            .map(|segment| {
                if query_terms.contains(&segment.to_lowercase()) {
                    format!("**{}**", segment)
                } else {
                    segment.to_string()
                }
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    #[test]
    fn test_full_text_index() -> Result<()> {
        let index = FullTextIndex::new("test_collection", "content");

        // Test document indexing
        let doc1 = Document {
            id: "1".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("The quick brown fox".to_string()));
                map
            },
        };

        let doc2 = Document {
            id: "2".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("The lazy brown dog".to_string()));
                map
            },
        };

        index.index_document(&doc1)?;
        index.index_document(&doc2)?;

        // Test search functionality
        let results = index.search("brown");
        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|(id, _)| id == "1"));
        assert!(results.iter().any(|(id, _)| id == "2"));

        let results = index.search("quick");
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "1");

        Ok(())
    }

    #[test]
    fn test_tokenization() -> Result<()> {
        let index = FullTextIndex::new("test", "content");
        let tokens = index.tokenize("The Quick-Brown FOX!");

        assert_eq!(tokens, vec![
            "the".to_string(),
            "quick".to_string(),
            "brown".to_string(),
            "fox".to_string(),
        ]);

        Ok(())
    }
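
    #[test]
    fn test_fuzzy_search() -> Result<()> {
        // A sketch of typo-tolerant lookup: "wireles" is one edit away from
        // the indexed term "wireless", so the document should still match.
        let index = FullTextIndex::new("test_collection", "content");

        let doc = Document {
            id: "1".to_string(),
            data: {
                let mut map = HashMap::new();
                map.insert("content".to_string(), Value::String("wireless headphones".to_string()));
                map
            },
        };
        index.index_document(&doc)?;

        let results = index.fuzzy_search("wireles", 2);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "1");

        // Classic edit-distance example: kitten -> sitting takes 3 edits
        assert_eq!(index.levenshtein_distance("kitten", "sitting"), 3);

        Ok(())
    }

    #[test]
    fn test_stop_word_filtering() -> Result<()> {
        // Stop-word filtering is opt-in: with it enabled and "the" registered
        // as a stopword, tokenize should drop it.
        let mut index = FullTextIndex::with_options("test", "content", true);
        index.add_stopwords(&["the"]);

        let tokens = index.tokenize("The quick fox");
        assert_eq!(tokens, vec!["quick".to_string(), "fox".to_string()]);

        Ok(())
    }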
}