// engine/fulltext.rs
1//! Full-text search with BM25 scoring
2//!
3//! Provides efficient text search capabilities using:
4//! - Inverted index for fast document lookup
5//! - BM25 scoring algorithm for relevance ranking
6//! - Configurable text analysis (tokenization, stemming, stop words)
7//! - Multi-language stemming support (Turbopuffer-inspired)
8
9use rust_stemmers::{Algorithm, Stemmer};
10use serde::{Deserialize, Serialize};
11use std::collections::{HashMap, HashSet};
12
/// Supported languages for stemming (Turbopuffer-inspired).
///
/// Every variant except `None` maps 1:1 onto a `rust_stemmers::Algorithm`
/// Snowball stemmer via `to_algorithm`. `None` — the `Default` — disables
/// stemming entirely.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum StemLanguage {
    /// No stemming applied
    #[default]
    None,
    /// Arabic
    Arabic,
    /// Danish
    Danish,
    /// Dutch
    Dutch,
    /// English (Porter algorithm)
    English,
    /// Finnish
    Finnish,
    /// French
    French,
    /// German
    German,
    /// Greek
    Greek,
    /// Hungarian
    Hungarian,
    /// Italian
    Italian,
    /// Norwegian
    Norwegian,
    /// Portuguese
    Portuguese,
    /// Romanian
    Romanian,
    /// Russian
    Russian,
    /// Spanish
    Spanish,
    /// Swedish
    Swedish,
    /// Tamil
    Tamil,
    /// Turkish
    Turkish,
}
56
57impl StemLanguage {
58    /// Convert to rust-stemmers Algorithm
59    fn to_algorithm(self) -> Option<Algorithm> {
60        match self {
61            StemLanguage::None => None,
62            StemLanguage::Arabic => Some(Algorithm::Arabic),
63            StemLanguage::Danish => Some(Algorithm::Danish),
64            StemLanguage::Dutch => Some(Algorithm::Dutch),
65            StemLanguage::English => Some(Algorithm::English),
66            StemLanguage::Finnish => Some(Algorithm::Finnish),
67            StemLanguage::French => Some(Algorithm::French),
68            StemLanguage::German => Some(Algorithm::German),
69            StemLanguage::Greek => Some(Algorithm::Greek),
70            StemLanguage::Hungarian => Some(Algorithm::Hungarian),
71            StemLanguage::Italian => Some(Algorithm::Italian),
72            StemLanguage::Norwegian => Some(Algorithm::Norwegian),
73            StemLanguage::Portuguese => Some(Algorithm::Portuguese),
74            StemLanguage::Romanian => Some(Algorithm::Romanian),
75            StemLanguage::Russian => Some(Algorithm::Russian),
76            StemLanguage::Spanish => Some(Algorithm::Spanish),
77            StemLanguage::Swedish => Some(Algorithm::Swedish),
78            StemLanguage::Tamil => Some(Algorithm::Tamil),
79            StemLanguage::Turkish => Some(Algorithm::Turkish),
80        }
81    }
82
83    /// Parse language from string (case-insensitive)
84    pub fn parse_str(s: &str) -> Option<Self> {
85        match s.to_lowercase().as_str() {
86            "none" | "" => Some(StemLanguage::None),
87            "arabic" | "ar" => Some(StemLanguage::Arabic),
88            "danish" | "da" => Some(StemLanguage::Danish),
89            "dutch" | "nl" => Some(StemLanguage::Dutch),
90            "english" | "en" => Some(StemLanguage::English),
91            "finnish" | "fi" => Some(StemLanguage::Finnish),
92            "french" | "fr" => Some(StemLanguage::French),
93            "german" | "de" => Some(StemLanguage::German),
94            "greek" | "el" => Some(StemLanguage::Greek),
95            "hungarian" | "hu" => Some(StemLanguage::Hungarian),
96            "italian" | "it" => Some(StemLanguage::Italian),
97            "norwegian" | "no" => Some(StemLanguage::Norwegian),
98            "portuguese" | "pt" => Some(StemLanguage::Portuguese),
99            "romanian" | "ro" => Some(StemLanguage::Romanian),
100            "russian" | "ru" => Some(StemLanguage::Russian),
101            "spanish" | "es" => Some(StemLanguage::Spanish),
102            "swedish" | "sv" => Some(StemLanguage::Swedish),
103            "tamil" | "ta" => Some(StemLanguage::Tamil),
104            "turkish" | "tr" => Some(StemLanguage::Turkish),
105            _ => None,
106        }
107    }
108
109    /// Get all supported language codes
110    pub fn supported_languages() -> &'static [&'static str] {
111        &[
112            "arabic",
113            "danish",
114            "dutch",
115            "english",
116            "finnish",
117            "french",
118            "german",
119            "greek",
120            "hungarian",
121            "italian",
122            "norwegian",
123            "portuguese",
124            "romanian",
125            "russian",
126            "spanish",
127            "swedish",
128            "tamil",
129            "turkish",
130        ]
131    }
132}
133
/// Configuration for full-text search.
///
/// Drives both the text-analysis pipeline (tokenization, stop words,
/// stemming) and BM25 scoring. Shared by `TextAnalyzer` and `InvertedIndex`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullTextConfig {
    /// BM25 parameter k1 (term frequency saturation)
    pub k1: f32,
    /// BM25 parameter b (document length normalization)
    pub b: f32,
    /// Minimum token length to index
    pub min_token_length: usize,
    /// Maximum token length to index
    pub max_token_length: usize,
    /// Whether to apply lowercase normalization
    pub lowercase: bool,
    /// Stop words to filter out
    pub stop_words: HashSet<String>,
    /// Language for stemming (None = no stemming)
    pub stem_language: StemLanguage,
}
152
153impl Default for FullTextConfig {
154    fn default() -> Self {
155        Self {
156            k1: 1.2,
157            b: 0.75,
158            min_token_length: 2,
159            max_token_length: 50,
160            lowercase: true,
161            stop_words: default_stop_words(),
162            stem_language: StemLanguage::None,
163        }
164    }
165}
166
167impl FullTextConfig {
168    /// Create config with English stemming enabled
169    pub fn with_english_stemming() -> Self {
170        Self {
171            stem_language: StemLanguage::English,
172            ..Default::default()
173        }
174    }
175
176    /// Create config with specified language stemming
177    pub fn with_language(language: StemLanguage) -> Self {
178        Self {
179            stem_language: language,
180            ..Default::default()
181        }
182    }
183}
184
/// Returns the built-in set of common English stop words that the analyzer
/// drops before indexing.
fn default_stop_words() -> HashSet<String> {
    let words = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "from",
        "had", "has", "have", "he", "how", "in", "is", "it", "its", "of", "on",
        "or", "that", "the", "they", "this", "to", "was", "were", "what",
        "when", "where", "which", "who", "why", "will", "with",
    ];
    words.into_iter().map(String::from).collect()
}
196
/// Text analyzer for tokenization, normalization, and stemming.
///
/// Holds the analysis config plus the resolved stemming algorithm. Cheap to
/// clone; the actual `Stemmer` is created on demand during analysis.
#[derive(Debug, Clone, Serialize)]
pub struct TextAnalyzer {
    config: FullTextConfig,
    /// Not serialized — reconstructed from config.stem_language on deserialization.
    #[serde(skip)]
    stem_algorithm: Option<Algorithm>,
}
205
// Manual Deserialize: only `config` is persisted; the derived `stem_algorithm`
// cache is rebuilt by routing deserialization through `TextAnalyzer::new`,
// which keeps the two fields consistent by construction.
impl<'de> Deserialize<'de> for TextAnalyzer {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        // Mirrors the serialized shape of `TextAnalyzer` (config field only).
        #[derive(Deserialize)]
        struct Helper {
            config: FullTextConfig,
        }
        let h = Helper::deserialize(d)?;
        Ok(TextAnalyzer::new(h.config))
    }
}
216
217impl TextAnalyzer {
218    pub fn new(config: FullTextConfig) -> Self {
219        let stem_algorithm = config.stem_language.to_algorithm();
220        Self {
221            config,
222            stem_algorithm,
223        }
224    }
225
226    /// Analyze text into tokens (with optional stemming)
227    pub fn analyze(&self, text: &str) -> Vec<String> {
228        let text = if self.config.lowercase {
229            text.to_lowercase()
230        } else {
231            text.to_string()
232        };
233
234        // Split on non-alphanumeric characters
235        let tokens: Vec<String> = text
236            .split(|c: char| !c.is_alphanumeric())
237            .filter(|token| {
238                let len = token.len();
239                len >= self.config.min_token_length
240                    && len <= self.config.max_token_length
241                    && !self.config.stop_words.contains(*token)
242            })
243            .map(|s| s.to_string())
244            .collect();
245
246        // Apply stemming if configured
247        if let Some(algorithm) = self.stem_algorithm {
248            let stemmer = Stemmer::create(algorithm);
249            tokens
250                .into_iter()
251                .map(|token| stemmer.stem(&token).to_string())
252                .collect()
253        } else {
254            tokens
255        }
256    }
257
258    /// Get token frequencies from text
259    pub fn token_frequencies(&self, text: &str) -> HashMap<String, u32> {
260        let mut freqs = HashMap::new();
261        for token in self.analyze(text) {
262            *freqs.entry(token).or_insert(0) += 1;
263        }
264        freqs
265    }
266
267    /// Get the configured stemming language
268    pub fn stem_language(&self) -> StemLanguage {
269        self.config.stem_language
270    }
271
272    /// Check if stemming is enabled
273    pub fn stemming_enabled(&self) -> bool {
274        self.stem_algorithm.is_some()
275    }
276}
277
278impl Default for TextAnalyzer {
279    fn default() -> Self {
280        Self::new(FullTextConfig::default())
281    }
282}
283
/// Posting list entry for the inverted index: one (term, document) pair.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Posting {
    /// Document ID
    pub doc_id: String,
    /// Term frequency in document
    pub term_freq: u32,
    /// Positions of term in document (token offsets from analysis, in order)
    pub positions: Vec<u32>,
}
294
/// Inverted index for full-text search with BM25 ranking.
///
/// `doc_count` and `avg_doc_length` are maintained incrementally on every
/// add/remove so scoring never has to rescan `doc_lengths`. Note that
/// `config` is also held (cloned) inside `analyzer`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InvertedIndex {
    /// Token -> list of postings
    index: HashMap<String, Vec<Posting>>,
    /// Document ID -> document length (token count)
    doc_lengths: HashMap<String, u32>,
    /// Document ID -> metadata (optional)
    doc_metadata: HashMap<String, serde_json::Value>,
    /// Total number of documents
    doc_count: u32,
    /// Average document length
    avg_doc_length: f32,
    /// Text analyzer
    analyzer: TextAnalyzer,
    /// BM25 config
    config: FullTextConfig,
}
313
314impl InvertedIndex {
315    pub fn new(config: FullTextConfig) -> Self {
316        let analyzer = TextAnalyzer::new(config.clone());
317        Self {
318            index: HashMap::new(),
319            doc_lengths: HashMap::new(),
320            doc_metadata: HashMap::new(),
321            doc_count: 0,
322            avg_doc_length: 0.0,
323            analyzer,
324            config,
325        }
326    }
327
328    /// Add a document to the index
329    pub fn add_document(&mut self, doc_id: &str, text: &str) {
330        self.add_document_with_metadata(doc_id, text, None);
331    }
332
333    /// Add a document with optional metadata to the index
334    pub fn add_document_with_metadata(
335        &mut self,
336        doc_id: &str,
337        text: &str,
338        metadata: Option<serde_json::Value>,
339    ) {
340        // Remove existing document if present
341        self.remove_document(doc_id);
342
343        // Store metadata if provided
344        if let Some(meta) = metadata {
345            self.doc_metadata.insert(doc_id.to_string(), meta);
346        }
347
348        let tokens = self.analyzer.analyze(text);
349        let doc_length = tokens.len() as u32;
350
351        // Update document length tracking
352        self.doc_lengths.insert(doc_id.to_string(), doc_length);
353        self.doc_count += 1;
354
355        // Recalculate average document length
356        let total_length: u32 = self.doc_lengths.values().sum();
357        self.avg_doc_length = total_length as f32 / self.doc_count as f32;
358
359        // Build term frequencies and positions
360        let mut term_data: HashMap<String, (u32, Vec<u32>)> = HashMap::new();
361        for (pos, token) in tokens.into_iter().enumerate() {
362            let entry = term_data.entry(token).or_insert((0, Vec::new()));
363            entry.0 += 1;
364            entry.1.push(pos as u32);
365        }
366
367        // Add postings to index
368        for (token, (freq, positions)) in term_data {
369            let posting = Posting {
370                doc_id: doc_id.to_string(),
371                term_freq: freq,
372                positions,
373            };
374            self.index.entry(token).or_default().push(posting);
375        }
376    }
377
378    /// Remove a document from the index
379    pub fn remove_document(&mut self, doc_id: &str) -> bool {
380        if self.doc_lengths.remove(doc_id).is_none() {
381            return false;
382        }
383
384        // Remove metadata if present
385        self.doc_metadata.remove(doc_id);
386
387        self.doc_count = self.doc_count.saturating_sub(1);
388
389        // Recalculate average
390        if self.doc_count > 0 {
391            let total_length: u32 = self.doc_lengths.values().sum();
392            self.avg_doc_length = total_length as f32 / self.doc_count as f32;
393        } else {
394            self.avg_doc_length = 0.0;
395        }
396
397        // Remove from all posting lists
398        for postings in self.index.values_mut() {
399            postings.retain(|p| p.doc_id != doc_id);
400        }
401
402        // Clean up empty posting lists
403        self.index.retain(|_, v| !v.is_empty());
404
405        true
406    }
407
408    /// Search the index using BM25 scoring
409    pub fn search(&self, query: &str, top_k: usize) -> Vec<FullTextResult> {
410        let query_tokens = self.analyzer.analyze(query);
411        if query_tokens.is_empty() {
412            return Vec::new();
413        }
414
415        // Calculate BM25 scores for each document
416        let mut scores: HashMap<String, f32> = HashMap::new();
417
418        for token in &query_tokens {
419            if let Some(postings) = self.index.get(token) {
420                let idf = self.calculate_idf(postings.len());
421
422                for posting in postings {
423                    let doc_length = self.doc_lengths.get(&posting.doc_id).copied().unwrap_or(0);
424                    let tf_score = self.calculate_tf(posting.term_freq, doc_length);
425                    let score = idf * tf_score;
426
427                    *scores.entry(posting.doc_id.clone()).or_insert(0.0) += score;
428                }
429            }
430        }
431
432        // Sort by score descending
433        let mut results: Vec<_> = scores
434            .into_iter()
435            .map(|(doc_id, score)| FullTextResult { doc_id, score })
436            .collect();
437
438        results.sort_by(|a, b| {
439            b.score
440                .partial_cmp(&a.score)
441                .unwrap_or(std::cmp::Ordering::Equal)
442        });
443        results.truncate(top_k);
444        results
445    }
446
447    /// Calculate IDF (Inverse Document Frequency)
448    fn calculate_idf(&self, doc_freq: usize) -> f32 {
449        let n = self.doc_count as f32;
450        let df = doc_freq as f32;
451        ((n - df + 0.5) / (df + 0.5) + 1.0).ln()
452    }
453
454    /// Calculate TF (Term Frequency) component with BM25 normalization
455    fn calculate_tf(&self, term_freq: u32, doc_length: u32) -> f32 {
456        let tf = term_freq as f32;
457        let dl = doc_length as f32;
458        let avgdl = self.avg_doc_length;
459        let k1 = self.config.k1;
460        let b = self.config.b;
461
462        let length_norm = 1.0 - b + b * (dl / avgdl);
463        (tf * (k1 + 1.0)) / (tf + k1 * length_norm)
464    }
465
466    /// Get index statistics
467    pub fn stats(&self) -> FullTextStats {
468        FullTextStats {
469            document_count: self.doc_count as usize,
470            unique_terms: self.index.len(),
471            avg_document_length: self.avg_doc_length,
472            total_postings: self.index.values().map(|v| v.len()).sum(),
473        }
474    }
475
476    /// Check if a document exists in the index
477    pub fn contains(&self, doc_id: &str) -> bool {
478        self.doc_lengths.contains_key(doc_id)
479    }
480
481    /// Get metadata for a document
482    pub fn get_metadata(&self, doc_id: &str) -> Option<&serde_json::Value> {
483        self.doc_metadata.get(doc_id)
484    }
485
486    /// Search the index with an optional metadata filter
487    pub fn search_with_filter(
488        &self,
489        query: &str,
490        top_k: usize,
491        filter: Option<&common::FilterExpression>,
492    ) -> Vec<FullTextResult> {
493        // Get initial results
494        let results = self.search(query, top_k * 2); // Get more to account for filtering
495
496        // Apply filter if provided
497        if let Some(filter_expr) = filter {
498            use crate::filter::evaluate_filter;
499            results
500                .into_iter()
501                .filter(|r| evaluate_filter(filter_expr, self.doc_metadata.get(&r.doc_id)))
502                .take(top_k)
503                .collect()
504        } else {
505            results.into_iter().take(top_k).collect()
506        }
507    }
508
509    /// Get the number of documents
510    pub fn len(&self) -> usize {
511        self.doc_count as usize
512    }
513
514    /// Check if the index is empty
515    pub fn is_empty(&self) -> bool {
516        self.doc_count == 0
517    }
518
519    /// Clear the index
520    pub fn clear(&mut self) {
521        self.index.clear();
522        self.doc_lengths.clear();
523        self.doc_count = 0;
524        self.avg_doc_length = 0.0;
525    }
526}
527
528impl Default for InvertedIndex {
529    fn default() -> Self {
530        Self::new(FullTextConfig::default())
531    }
532}
533
/// Full-text search result: one scored document.
#[derive(Debug, Clone)]
pub struct FullTextResult {
    /// Document ID
    pub doc_id: String,
    /// BM25 score (higher is more relevant)
    pub score: f32,
}
542
/// Full-text index statistics, as reported by `InvertedIndex::stats`.
#[derive(Debug, Clone)]
pub struct FullTextStats {
    /// Number of indexed documents
    pub document_count: usize,
    /// Number of unique terms
    pub unique_terms: usize,
    /// Average document length
    pub avg_document_length: f32,
    /// Total number of postings
    pub total_postings: usize,
}
555
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_analyzer_basic() {
        let analyzer = TextAnalyzer::default();
        let tokens = analyzer.analyze("Hello World! This is a test.");

        // "a" is a stop word, "is" is a stop word
        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        assert!(!tokens.contains(&"a".to_string())); // stop word
    }

    #[test]
    fn test_text_analyzer_case_insensitive() {
        let analyzer = TextAnalyzer::default();
        let tokens = analyzer.analyze("HELLO hello HeLLo");

        assert_eq!(tokens.iter().filter(|t| *t == "hello").count(), 3);
    }

    #[test]
    fn test_text_analyzer_token_length() {
        // Struct-update syntax instead of field reassignment after default
        // (clippy::field_reassign_with_default).
        let config = FullTextConfig {
            min_token_length: 3,
            max_token_length: 5,
            ..FullTextConfig::default()
        };
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("a ab abc abcd abcde abcdef");

        assert!(!tokens.contains(&"a".to_string()));
        assert!(!tokens.contains(&"ab".to_string()));
        assert!(tokens.contains(&"abc".to_string()));
        assert!(tokens.contains(&"abcd".to_string()));
        assert!(tokens.contains(&"abcde".to_string()));
        assert!(!tokens.contains(&"abcdef".to_string()));
    }

    #[test]
    fn test_token_frequencies() {
        let analyzer = TextAnalyzer::default();
        let freqs = analyzer.token_frequencies("hello hello world hello");

        assert_eq!(freqs.get("hello"), Some(&3));
        assert_eq!(freqs.get("world"), Some(&1));
    }

    #[test]
    fn test_inverted_index_add_and_search() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "The quick brown fox jumps over the lazy dog");
        index.add_document("doc2", "A quick brown dog runs in the park");
        index.add_document("doc3", "The lazy cat sleeps all day");

        let results = index.search("quick brown", 10);

        // doc1 and doc2 both have "quick" and "brown"
        assert!(!results.is_empty());
        assert!(results.iter().any(|r| r.doc_id == "doc1"));
        assert!(results.iter().any(|r| r.doc_id == "doc2"));
        // doc3 doesn't have these terms
        assert!(!results.iter().any(|r| r.doc_id == "doc3"));
    }

    #[test]
    fn test_inverted_index_ranking() {
        let mut index = InvertedIndex::default();

        // doc1 has "rust" twice
        index.add_document("doc1", "rust is awesome, rust programming");
        // doc2 has "rust" once
        index.add_document("doc2", "rust programming language");
        // doc3 doesn't have "rust"
        index.add_document("doc3", "python programming language");

        let results = index.search("rust", 10);

        assert_eq!(results.len(), 2);
        // doc1 should rank higher (more occurrences)
        assert_eq!(results[0].doc_id, "doc1");
        assert_eq!(results[1].doc_id, "doc2");
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_inverted_index_remove() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");
        index.add_document("doc2", "hello universe");

        assert_eq!(index.len(), 2);

        let removed = index.remove_document("doc1");
        assert!(removed);
        assert_eq!(index.len(), 1);

        let results = index.search("hello", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc2");
    }

    #[test]
    fn test_inverted_index_update() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "original content about cats");
        let results1 = index.search("cats", 10);
        assert_eq!(results1.len(), 1);

        // Update document
        index.add_document("doc1", "updated content about dogs");

        let results2 = index.search("cats", 10);
        assert_eq!(results2.len(), 0);

        let results3 = index.search("dogs", 10);
        assert_eq!(results3.len(), 1);
    }

    #[test]
    fn test_inverted_index_empty_query() {
        let mut index = InvertedIndex::default();
        index.add_document("doc1", "hello world");

        // Query with only stop words
        let results = index.search("the is a", 10);
        assert!(results.is_empty());
    }

    #[test]
    fn test_inverted_index_stats() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world test");
        index.add_document("doc2", "hello universe example");

        let stats = index.stats();

        assert_eq!(stats.document_count, 2);
        assert!(stats.unique_terms > 0);
        assert!(stats.avg_document_length > 0.0);
        assert!(stats.total_postings > 0);
    }

    #[test]
    fn test_inverted_index_clear() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");
        index.add_document("doc2", "hello universe");

        assert_eq!(index.len(), 2);

        index.clear();

        assert_eq!(index.len(), 0);
        assert!(index.is_empty());
        assert_eq!(index.stats().unique_terms, 0);
    }

    #[test]
    fn test_bm25_idf() {
        // Previously an empty placeholder. Exercise the IDF formula directly
        // (calculate_idf is private but reachable from this child module):
        // rarer terms must score a higher IDF than common terms.
        let mut index = InvertedIndex::default();
        index.add_document("d1", "alpha beta");
        index.add_document("d2", "alpha gamma");
        index.add_document("d3", "alpha delta");

        let rare = index.calculate_idf(1); // term in 1 of 3 docs
        let common = index.calculate_idf(3); // term in all 3 docs

        assert!(rare > common);
        // The "+ 1.0" inside the log keeps IDF positive even for ubiquitous terms.
        assert!(common > 0.0);
    }

    #[test]
    fn test_bm25_length_normalization() {
        let mut index = InvertedIndex::default();

        // Short document with "rust"
        index.add_document("short", "rust");
        // Long document with "rust" once
        index.add_document(
            "long",
            "rust programming language framework library ecosystem tools community",
        );

        let results = index.search("rust", 10);

        assert_eq!(results.len(), 2);
        // Short document ranks first: same term frequency, but length
        // normalization rewards the higher term density.
        assert_eq!(results[0].doc_id, "short");
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_contains() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");

        assert!(index.contains("doc1"));
        assert!(!index.contains("doc2"));
    }

    #[test]
    fn test_custom_config() {
        let config = FullTextConfig {
            k1: 1.5,
            b: 0.5,
            min_token_length: 1,
            max_token_length: 100,
            lowercase: false,
            stop_words: HashSet::new(),
            stem_language: StemLanguage::None,
        };

        let mut index = InvertedIndex::new(config);
        index.add_document("doc1", "A B C");

        // With no stop words and min length 1, all tokens should be indexed
        let results = index.search("A", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_special_characters() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello@world.com test-case under_score");

        // Should split on special characters
        let results = index.search("hello", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("world", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("test", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_numeric_tokens() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "version 123 release 2024");

        let results = index.search("123", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("2024", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_phrase_search_basic() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "quick brown fox");
        index.add_document("doc2", "brown quick fox");

        // Both documents contain "quick" and "brown" — term order is not
        // scored, so both match equally.
        let results = index.search("quick brown", 10);
        assert_eq!(results.len(), 2);
    }

    // =========================================================================
    // Stemming Tests (Turbopuffer-inspired multi-language support)
    // =========================================================================

    #[test]
    fn test_english_stemming() {
        let config = FullTextConfig::with_english_stemming();
        let analyzer = TextAnalyzer::new(config);

        // "running" -> "run", "jumped" -> "jump", "cats" -> "cat"
        let tokens = analyzer.analyze("The cats were running and jumping");

        assert!(tokens.contains(&"cat".to_string())); // stemmed from "cats"
        assert!(tokens.contains(&"run".to_string())); // stemmed from "running"
        assert!(tokens.contains(&"jump".to_string())); // stemmed from "jumping"
    }

    #[test]
    fn test_english_stemming_search() {
        let config = FullTextConfig::with_english_stemming();
        let mut index = InvertedIndex::new(config);

        index.add_document("doc1", "The programmer is programming applications");
        index.add_document("doc2", "Software development requires developers");
        index.add_document("doc3", "Cooking recipes for beginners");

        // Search for "program" should match doc1 (programmer, programming -> program)
        let results = index.search("program", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc1");

        // Search for "develop" should match doc2 (development, developers -> develop)
        let results = index.search("develop", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc2");

        // Search for "programming" should also match doc1 (both query and doc are stemmed)
        let results = index.search("programming", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc1");
    }

    #[test]
    fn test_german_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::German);
        let analyzer = TextAnalyzer::new(config);

        // German stemming test
        let _tokens = analyzer.analyze("Die Entwickler entwickeln Software");

        // Both "Entwickler" and "entwickeln" should stem to similar root
        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::German);
    }

    #[test]
    fn test_french_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::French);
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("Les programmeurs programment des applications");

        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::French);
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_spanish_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::Spanish);
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("Los desarrolladores desarrollan aplicaciones");

        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::Spanish);
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_no_stemming_default() {
        let analyzer = TextAnalyzer::default();

        // Default should not apply stemming
        assert!(!analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::None);

        let tokens = analyzer.analyze("running jumped cats");

        // Without stemming, tokens stay as-is
        assert!(tokens.contains(&"running".to_string()));
        assert!(tokens.contains(&"jumped".to_string()));
        assert!(tokens.contains(&"cats".to_string()));
    }

    #[test]
    fn test_stem_language_from_str() {
        assert_eq!(
            StemLanguage::parse_str("english"),
            Some(StemLanguage::English)
        );
        assert_eq!(StemLanguage::parse_str("en"), Some(StemLanguage::English));
        assert_eq!(
            StemLanguage::parse_str("ENGLISH"),
            Some(StemLanguage::English)
        );
        assert_eq!(
            StemLanguage::parse_str("french"),
            Some(StemLanguage::French)
        );
        assert_eq!(StemLanguage::parse_str("fr"), Some(StemLanguage::French));
        assert_eq!(
            StemLanguage::parse_str("german"),
            Some(StemLanguage::German)
        );
        assert_eq!(StemLanguage::parse_str("de"), Some(StemLanguage::German));
        assert_eq!(
            StemLanguage::parse_str("spanish"),
            Some(StemLanguage::Spanish)
        );
        assert_eq!(StemLanguage::parse_str("es"), Some(StemLanguage::Spanish));
        assert_eq!(StemLanguage::parse_str("none"), Some(StemLanguage::None));
        assert_eq!(StemLanguage::parse_str(""), Some(StemLanguage::None));
        assert_eq!(StemLanguage::parse_str("invalid"), None);
    }

    #[test]
    fn test_supported_languages() {
        let languages = StemLanguage::supported_languages();
        assert!(languages.contains(&"english"));
        assert!(languages.contains(&"french"));
        assert!(languages.contains(&"german"));
        assert!(languages.contains(&"spanish"));
        assert!(languages.contains(&"italian"));
        assert!(languages.contains(&"portuguese"));
        assert!(languages.contains(&"russian"));
        assert!(languages.contains(&"arabic"));
        assert_eq!(languages.len(), 18); // All 18 supported languages
    }
}