1use rust_stemmers::{Algorithm, Stemmer};
10use serde::{Deserialize, Serialize};
11use std::collections::{HashMap, HashSet};
12
/// Languages supported by the Snowball stemmer, plus [`StemLanguage::None`]
/// to disable stemming entirely.
///
/// Defaults to `None`, so an analyzer performs no stemming unless a language
/// is selected explicitly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum StemLanguage {
    /// No stemming; tokens are indexed verbatim.
    #[default]
    None,
    Arabic,
    Danish,
    Dutch,
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}
56
57impl StemLanguage {
58 fn to_algorithm(self) -> Option<Algorithm> {
60 match self {
61 StemLanguage::None => None,
62 StemLanguage::Arabic => Some(Algorithm::Arabic),
63 StemLanguage::Danish => Some(Algorithm::Danish),
64 StemLanguage::Dutch => Some(Algorithm::Dutch),
65 StemLanguage::English => Some(Algorithm::English),
66 StemLanguage::Finnish => Some(Algorithm::Finnish),
67 StemLanguage::French => Some(Algorithm::French),
68 StemLanguage::German => Some(Algorithm::German),
69 StemLanguage::Greek => Some(Algorithm::Greek),
70 StemLanguage::Hungarian => Some(Algorithm::Hungarian),
71 StemLanguage::Italian => Some(Algorithm::Italian),
72 StemLanguage::Norwegian => Some(Algorithm::Norwegian),
73 StemLanguage::Portuguese => Some(Algorithm::Portuguese),
74 StemLanguage::Romanian => Some(Algorithm::Romanian),
75 StemLanguage::Russian => Some(Algorithm::Russian),
76 StemLanguage::Spanish => Some(Algorithm::Spanish),
77 StemLanguage::Swedish => Some(Algorithm::Swedish),
78 StemLanguage::Tamil => Some(Algorithm::Tamil),
79 StemLanguage::Turkish => Some(Algorithm::Turkish),
80 }
81 }
82
83 pub fn parse_str(s: &str) -> Option<Self> {
85 match s.to_lowercase().as_str() {
86 "none" | "" => Some(StemLanguage::None),
87 "arabic" | "ar" => Some(StemLanguage::Arabic),
88 "danish" | "da" => Some(StemLanguage::Danish),
89 "dutch" | "nl" => Some(StemLanguage::Dutch),
90 "english" | "en" => Some(StemLanguage::English),
91 "finnish" | "fi" => Some(StemLanguage::Finnish),
92 "french" | "fr" => Some(StemLanguage::French),
93 "german" | "de" => Some(StemLanguage::German),
94 "greek" | "el" => Some(StemLanguage::Greek),
95 "hungarian" | "hu" => Some(StemLanguage::Hungarian),
96 "italian" | "it" => Some(StemLanguage::Italian),
97 "norwegian" | "no" => Some(StemLanguage::Norwegian),
98 "portuguese" | "pt" => Some(StemLanguage::Portuguese),
99 "romanian" | "ro" => Some(StemLanguage::Romanian),
100 "russian" | "ru" => Some(StemLanguage::Russian),
101 "spanish" | "es" => Some(StemLanguage::Spanish),
102 "swedish" | "sv" => Some(StemLanguage::Swedish),
103 "tamil" | "ta" => Some(StemLanguage::Tamil),
104 "turkish" | "tr" => Some(StemLanguage::Turkish),
105 _ => None,
106 }
107 }
108
109 pub fn supported_languages() -> &'static [&'static str] {
111 &[
112 "arabic",
113 "danish",
114 "dutch",
115 "english",
116 "finnish",
117 "french",
118 "german",
119 "greek",
120 "hungarian",
121 "italian",
122 "norwegian",
123 "portuguese",
124 "romanian",
125 "russian",
126 "spanish",
127 "swedish",
128 "tamil",
129 "turkish",
130 ]
131 }
132}
133
/// Tokenization and BM25 scoring parameters shared by the analyzer and index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullTextConfig {
    /// BM25 term-frequency saturation parameter.
    pub k1: f32,
    /// BM25 document-length normalization strength (0.0..=1.0).
    pub b: f32,
    /// Tokens shorter than this are dropped during analysis.
    pub min_token_length: usize,
    /// Tokens longer than this are dropped during analysis.
    pub max_token_length: usize,
    /// Lowercase text before tokenizing when `true`.
    pub lowercase: bool,
    /// Tokens removed from both documents and queries.
    pub stop_words: HashSet<String>,
    /// Stemming language; `StemLanguage::None` disables stemming.
    pub stem_language: StemLanguage,
}
152
153impl Default for FullTextConfig {
154 fn default() -> Self {
155 Self {
156 k1: 1.2,
157 b: 0.75,
158 min_token_length: 2,
159 max_token_length: 50,
160 lowercase: true,
161 stop_words: default_stop_words(),
162 stem_language: StemLanguage::None,
163 }
164 }
165}
166
167impl FullTextConfig {
168 pub fn with_english_stemming() -> Self {
170 Self {
171 stem_language: StemLanguage::English,
172 ..Default::default()
173 }
174 }
175
176 pub fn with_language(language: StemLanguage) -> Self {
178 Self {
179 stem_language: language,
180 ..Default::default()
181 }
182 }
183}
184
/// Built-in English stop-word list used by `FullTextConfig::default()`.
fn default_stop_words() -> HashSet<String> {
    const WORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is",
        "it", "its", "of", "on", "or", "that", "the", "to", "was", "were", "will", "with", "this",
        "but", "they", "have", "had", "what", "when", "where", "who", "which", "why", "how",
    ];
    WORDS.iter().map(|&w| w.to_owned()).collect()
}
196
/// Turns raw text into normalized index terms (lowercase, length filter,
/// stop-word removal, optional stemming).
#[derive(Debug, Clone, Serialize)]
pub struct TextAnalyzer {
    config: FullTextConfig,
    // Cached from `config.stem_language.to_algorithm()`; derived state, so it
    // is skipped on serialization and rebuilt in the manual Deserialize impl.
    #[serde(skip)]
    stem_algorithm: Option<Algorithm>,
}
205
// Manual impl: `stem_algorithm` is `#[serde(skip)]`-ed, so only `config` is
// present in the serialized form. Deserialize that and rebuild the cached
// algorithm through `TextAnalyzer::new`.
impl<'de> Deserialize<'de> for TextAnalyzer {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        // Mirrors the serialized shape (the `config` field only).
        #[derive(Deserialize)]
        struct Helper {
            config: FullTextConfig,
        }
        let h = Helper::deserialize(d)?;
        Ok(TextAnalyzer::new(h.config))
    }
}
216
217impl TextAnalyzer {
218 pub fn new(config: FullTextConfig) -> Self {
219 let stem_algorithm = config.stem_language.to_algorithm();
220 Self {
221 config,
222 stem_algorithm,
223 }
224 }
225
226 pub fn analyze(&self, text: &str) -> Vec<String> {
228 let text = if self.config.lowercase {
229 text.to_lowercase()
230 } else {
231 text.to_string()
232 };
233
234 let tokens: Vec<String> = text
236 .split(|c: char| !c.is_alphanumeric())
237 .filter(|token| {
238 let len = token.len();
239 len >= self.config.min_token_length
240 && len <= self.config.max_token_length
241 && !self.config.stop_words.contains(*token)
242 })
243 .map(|s| s.to_string())
244 .collect();
245
246 if let Some(algorithm) = self.stem_algorithm {
248 let stemmer = Stemmer::create(algorithm);
249 tokens
250 .into_iter()
251 .map(|token| stemmer.stem(&token).to_string())
252 .collect()
253 } else {
254 tokens
255 }
256 }
257
258 pub fn token_frequencies(&self, text: &str) -> HashMap<String, u32> {
260 let mut freqs = HashMap::new();
261 for token in self.analyze(text) {
262 *freqs.entry(token).or_insert(0) += 1;
263 }
264 freqs
265 }
266
267 pub fn stem_language(&self) -> StemLanguage {
269 self.config.stem_language
270 }
271
272 pub fn stemming_enabled(&self) -> bool {
274 self.stem_algorithm.is_some()
275 }
276}
277
278impl Default for TextAnalyzer {
279 fn default() -> Self {
280 Self::new(FullTextConfig::default())
281 }
282}
283
/// One (term, document) entry in the inverted index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Posting {
    /// Identifier of the document containing the term.
    pub doc_id: String,
    /// Number of occurrences of the term in the document.
    pub term_freq: u32,
    /// Zero-based token positions of each occurrence within the document.
    pub positions: Vec<u32>,
}
294
/// BM25-scored inverted index over text documents.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InvertedIndex {
    // term -> postings, one per document containing the term
    index: HashMap<String, Vec<Posting>>,
    // doc_id -> token count after analysis
    doc_lengths: HashMap<String, u32>,
    // doc_id -> caller-supplied JSON metadata (used by filtered search)
    doc_metadata: HashMap<String, serde_json::Value>,
    // number of currently indexed documents
    doc_count: u32,
    // mean of `doc_lengths` values; 0.0 when the index is empty
    avg_doc_length: f32,
    // built from `config`; applied identically to documents and queries
    analyzer: TextAnalyzer,
    // NOTE(review): duplicated inside `analyzer`'s own config; kept here for
    // direct access to k1/b during scoring.
    config: FullTextConfig,
}
313
314impl InvertedIndex {
315 pub fn new(config: FullTextConfig) -> Self {
316 let analyzer = TextAnalyzer::new(config.clone());
317 Self {
318 index: HashMap::new(),
319 doc_lengths: HashMap::new(),
320 doc_metadata: HashMap::new(),
321 doc_count: 0,
322 avg_doc_length: 0.0,
323 analyzer,
324 config,
325 }
326 }
327
328 pub fn add_document(&mut self, doc_id: &str, text: &str) {
330 self.add_document_with_metadata(doc_id, text, None);
331 }
332
333 pub fn add_document_with_metadata(
335 &mut self,
336 doc_id: &str,
337 text: &str,
338 metadata: Option<serde_json::Value>,
339 ) {
340 self.remove_document(doc_id);
342
343 if let Some(meta) = metadata {
345 self.doc_metadata.insert(doc_id.to_string(), meta);
346 }
347
348 let tokens = self.analyzer.analyze(text);
349 let doc_length = tokens.len() as u32;
350
351 self.doc_lengths.insert(doc_id.to_string(), doc_length);
353 self.doc_count += 1;
354
355 let total_length: u32 = self.doc_lengths.values().sum();
357 self.avg_doc_length = total_length as f32 / self.doc_count as f32;
358
359 let mut term_data: HashMap<String, (u32, Vec<u32>)> = HashMap::new();
361 for (pos, token) in tokens.into_iter().enumerate() {
362 let entry = term_data.entry(token).or_insert((0, Vec::new()));
363 entry.0 += 1;
364 entry.1.push(pos as u32);
365 }
366
367 for (token, (freq, positions)) in term_data {
369 let posting = Posting {
370 doc_id: doc_id.to_string(),
371 term_freq: freq,
372 positions,
373 };
374 self.index.entry(token).or_default().push(posting);
375 }
376 }
377
378 pub fn remove_document(&mut self, doc_id: &str) -> bool {
380 if self.doc_lengths.remove(doc_id).is_none() {
381 return false;
382 }
383
384 self.doc_metadata.remove(doc_id);
386
387 self.doc_count = self.doc_count.saturating_sub(1);
388
389 if self.doc_count > 0 {
391 let total_length: u32 = self.doc_lengths.values().sum();
392 self.avg_doc_length = total_length as f32 / self.doc_count as f32;
393 } else {
394 self.avg_doc_length = 0.0;
395 }
396
397 for postings in self.index.values_mut() {
399 postings.retain(|p| p.doc_id != doc_id);
400 }
401
402 self.index.retain(|_, v| !v.is_empty());
404
405 true
406 }
407
408 pub fn search(&self, query: &str, top_k: usize) -> Vec<FullTextResult> {
410 let query_tokens = self.analyzer.analyze(query);
411 if query_tokens.is_empty() {
412 return Vec::new();
413 }
414
415 let mut scores: HashMap<String, f32> = HashMap::new();
417
418 for token in &query_tokens {
419 if let Some(postings) = self.index.get(token) {
420 let idf = self.calculate_idf(postings.len());
421
422 for posting in postings {
423 let doc_length = self.doc_lengths.get(&posting.doc_id).copied().unwrap_or(0);
424 let tf_score = self.calculate_tf(posting.term_freq, doc_length);
425 let score = idf * tf_score;
426
427 *scores.entry(posting.doc_id.clone()).or_insert(0.0) += score;
428 }
429 }
430 }
431
432 let mut results: Vec<_> = scores
434 .into_iter()
435 .map(|(doc_id, score)| FullTextResult { doc_id, score })
436 .collect();
437
438 results.sort_by(|a, b| {
439 b.score
440 .partial_cmp(&a.score)
441 .unwrap_or(std::cmp::Ordering::Equal)
442 });
443 results.truncate(top_k);
444 results
445 }
446
447 fn calculate_idf(&self, doc_freq: usize) -> f32 {
449 let n = self.doc_count as f32;
450 let df = doc_freq as f32;
451 ((n - df + 0.5) / (df + 0.5) + 1.0).ln()
452 }
453
454 fn calculate_tf(&self, term_freq: u32, doc_length: u32) -> f32 {
456 let tf = term_freq as f32;
457 let dl = doc_length as f32;
458 let avgdl = self.avg_doc_length;
459 let k1 = self.config.k1;
460 let b = self.config.b;
461
462 let length_norm = 1.0 - b + b * (dl / avgdl);
463 (tf * (k1 + 1.0)) / (tf + k1 * length_norm)
464 }
465
466 pub fn stats(&self) -> FullTextStats {
468 FullTextStats {
469 document_count: self.doc_count as usize,
470 unique_terms: self.index.len(),
471 avg_document_length: self.avg_doc_length,
472 total_postings: self.index.values().map(|v| v.len()).sum(),
473 }
474 }
475
476 pub fn contains(&self, doc_id: &str) -> bool {
478 self.doc_lengths.contains_key(doc_id)
479 }
480
481 pub fn get_metadata(&self, doc_id: &str) -> Option<&serde_json::Value> {
483 self.doc_metadata.get(doc_id)
484 }
485
486 pub fn search_with_filter(
488 &self,
489 query: &str,
490 top_k: usize,
491 filter: Option<&common::FilterExpression>,
492 ) -> Vec<FullTextResult> {
493 let results = self.search(query, top_k * 2); if let Some(filter_expr) = filter {
498 use crate::filter::evaluate_filter;
499 results
500 .into_iter()
501 .filter(|r| evaluate_filter(filter_expr, self.doc_metadata.get(&r.doc_id)))
502 .take(top_k)
503 .collect()
504 } else {
505 results.into_iter().take(top_k).collect()
506 }
507 }
508
509 pub fn len(&self) -> usize {
511 self.doc_count as usize
512 }
513
514 pub fn is_empty(&self) -> bool {
516 self.doc_count == 0
517 }
518
519 pub fn clear(&mut self) {
521 self.index.clear();
522 self.doc_lengths.clear();
523 self.doc_count = 0;
524 self.avg_doc_length = 0.0;
525 }
526}
527
528impl Default for InvertedIndex {
529 fn default() -> Self {
530 Self::new(FullTextConfig::default())
531 }
532}
533
/// A single search hit.
#[derive(Debug, Clone)]
pub struct FullTextResult {
    /// Identifier of the matching document.
    pub doc_id: String,
    /// Accumulated BM25 score across all query terms.
    pub score: f32,
}
542
/// Index-level statistics returned by `InvertedIndex::stats`.
#[derive(Debug, Clone)]
pub struct FullTextStats {
    /// Number of indexed documents.
    pub document_count: usize,
    /// Number of distinct terms across the index.
    pub unique_terms: usize,
    /// Mean analyzed token count per document.
    pub avg_document_length: f32,
    /// Total posting entries across all terms.
    pub total_postings: usize,
}
555
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_analyzer_basic() {
        let analyzer = TextAnalyzer::default();
        let tokens = analyzer.analyze("Hello World! This is a test.");

        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        // "a" is both a stop word and below the default min token length.
        assert!(!tokens.contains(&"a".to_string()));
    }

    #[test]
    fn test_text_analyzer_case_insensitive() {
        let analyzer = TextAnalyzer::default();
        let tokens = analyzer.analyze("HELLO hello HeLLo");

        assert_eq!(tokens.iter().filter(|t| *t == "hello").count(), 3);
    }

    #[test]
    fn test_text_analyzer_token_length() {
        let config = FullTextConfig {
            min_token_length: 3,
            max_token_length: 5,
            ..FullTextConfig::default()
        };
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("a ab abc abcd abcde abcdef");

        assert!(!tokens.contains(&"a".to_string()));
        assert!(!tokens.contains(&"ab".to_string()));
        assert!(tokens.contains(&"abc".to_string()));
        assert!(tokens.contains(&"abcd".to_string()));
        assert!(tokens.contains(&"abcde".to_string()));
        assert!(!tokens.contains(&"abcdef".to_string()));
    }

    #[test]
    fn test_token_frequencies() {
        let analyzer = TextAnalyzer::default();
        let freqs = analyzer.token_frequencies("hello hello world hello");

        assert_eq!(freqs.get("hello"), Some(&3));
        assert_eq!(freqs.get("world"), Some(&1));
    }

    #[test]
    fn test_inverted_index_add_and_search() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "The quick brown fox jumps over the lazy dog");
        index.add_document("doc2", "A quick brown dog runs in the park");
        index.add_document("doc3", "The lazy cat sleeps all day");

        let results = index.search("quick brown", 10);

        assert!(!results.is_empty());
        assert!(results.iter().any(|r| r.doc_id == "doc1"));
        assert!(results.iter().any(|r| r.doc_id == "doc2"));
        assert!(!results.iter().any(|r| r.doc_id == "doc3"));
    }

    #[test]
    fn test_inverted_index_ranking() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "rust is awesome, rust programming");
        index.add_document("doc2", "rust programming language");
        index.add_document("doc3", "python programming language");

        let results = index.search("rust", 10);

        assert_eq!(results.len(), 2);
        // doc1 mentions "rust" twice, so it must outrank doc2.
        assert_eq!(results[0].doc_id, "doc1");
        assert_eq!(results[1].doc_id, "doc2");
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_inverted_index_remove() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");
        index.add_document("doc2", "hello universe");

        assert_eq!(index.len(), 2);

        let removed = index.remove_document("doc1");
        assert!(removed);
        assert_eq!(index.len(), 1);

        let results = index.search("hello", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc2");
    }

    #[test]
    fn test_inverted_index_update() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "original content about cats");
        let results1 = index.search("cats", 10);
        assert_eq!(results1.len(), 1);

        // Re-adding the same id replaces the old postings entirely.
        index.add_document("doc1", "updated content about dogs");

        let results2 = index.search("cats", 10);
        assert_eq!(results2.len(), 0);

        let results3 = index.search("dogs", 10);
        assert_eq!(results3.len(), 1);
    }

    #[test]
    fn test_inverted_index_empty_query() {
        let mut index = InvertedIndex::default();
        index.add_document("doc1", "hello world");

        // A query of nothing but stop words analyzes to zero tokens.
        let results = index.search("the is a", 10);
        assert!(results.is_empty());
    }

    #[test]
    fn test_inverted_index_stats() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world test");
        index.add_document("doc2", "hello universe example");

        let stats = index.stats();

        assert_eq!(stats.document_count, 2);
        assert!(stats.unique_terms > 0);
        assert!(stats.avg_document_length > 0.0);
        assert!(stats.total_postings > 0);
    }

    #[test]
    fn test_inverted_index_clear() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");
        index.add_document("doc2", "hello universe");

        assert_eq!(index.len(), 2);

        index.clear();

        assert_eq!(index.len(), 0);
        assert!(index.is_empty());
        assert_eq!(index.stats().unique_terms, 0);
    }

    #[test]
    fn test_bm25_idf() {
        // Previously an empty stub. `calculate_idf` is private but reachable
        // here because tests are a child module of the defining module.
        let mut index = InvertedIndex::default();
        index.add_document("doc1", "alpha rust");
        index.add_document("doc2", "beta rust");
        index.add_document("doc3", "gamma rust");

        // A term found in one of three documents must out-weigh a term found
        // in all three, and the `+ 1.0` inside the log keeps both positive.
        let rare_idf = index.calculate_idf(1);
        let common_idf = index.calculate_idf(3);
        assert!(rare_idf > common_idf);
        assert!(common_idf > 0.0);
    }

    #[test]
    fn test_bm25_length_normalization() {
        let mut index = InvertedIndex::default();

        index.add_document("short", "rust");
        index.add_document(
            "long",
            "rust programming language framework library ecosystem tools community",
        );

        let results = index.search("rust", 10);

        assert_eq!(results.len(), 2);
        // Equal term frequency, so BM25 length normalization must rank the
        // shorter document first.
        assert_eq!(results[0].doc_id, "short");
        assert!(results[0].score > results[1].score);
    }

    #[test]
    fn test_contains() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello world");

        assert!(index.contains("doc1"));
        assert!(!index.contains("doc2"));
    }

    #[test]
    fn test_custom_config() {
        let config = FullTextConfig {
            k1: 1.5,
            b: 0.5,
            min_token_length: 1,
            max_token_length: 100,
            lowercase: false,
            stop_words: HashSet::new(),
            stem_language: StemLanguage::None,
        };

        let mut index = InvertedIndex::new(config);
        index.add_document("doc1", "A B C");

        // Case-sensitive config: "A" must match exactly.
        let results = index.search("A", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_special_characters() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "hello@world.com test-case under_score");

        // Non-alphanumeric characters act as token separators.
        let results = index.search("hello", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("world", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("test", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_numeric_tokens() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "version 123 release 2024");

        let results = index.search("123", 10);
        assert_eq!(results.len(), 1);

        let results = index.search("2024", 10);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_phrase_search_basic() {
        let mut index = InvertedIndex::default();

        index.add_document("doc1", "quick brown fox");
        index.add_document("doc2", "brown quick fox");

        // Bag-of-words scoring: word order does not matter to `search`.
        let results = index.search("quick brown", 10);
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn test_english_stemming() {
        let config = FullTextConfig::with_english_stemming();
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("The cats were running and jumping");

        assert!(tokens.contains(&"cat".to_string()));
        assert!(tokens.contains(&"run".to_string()));
        assert!(tokens.contains(&"jump".to_string()));
    }

    #[test]
    fn test_english_stemming_search() {
        let config = FullTextConfig::with_english_stemming();
        let mut index = InvertedIndex::new(config);

        index.add_document("doc1", "The programmer is programming applications");
        index.add_document("doc2", "Software development requires developers");
        index.add_document("doc3", "Cooking recipes for beginners");

        // Query terms are stemmed with the same analyzer as documents.
        let results = index.search("program", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc1");

        let results = index.search("develop", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc2");

        let results = index.search("programming", 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, "doc1");
    }

    #[test]
    fn test_german_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::German);
        let analyzer = TextAnalyzer::new(config);

        let _tokens = analyzer.analyze("Die Entwickler entwickeln Software");

        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::German);
    }

    #[test]
    fn test_french_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::French);
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("Les programmeurs programment des applications");

        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::French);
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_spanish_stemming() {
        let config = FullTextConfig::with_language(StemLanguage::Spanish);
        let analyzer = TextAnalyzer::new(config);

        let tokens = analyzer.analyze("Los desarrolladores desarrollan aplicaciones");

        assert!(analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::Spanish);
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_no_stemming_default() {
        let analyzer = TextAnalyzer::default();

        assert!(!analyzer.stemming_enabled());
        assert_eq!(analyzer.stem_language(), StemLanguage::None);

        let tokens = analyzer.analyze("running jumped cats");

        // With stemming off, surface forms are indexed unchanged.
        assert!(tokens.contains(&"running".to_string()));
        assert!(tokens.contains(&"jumped".to_string()));
        assert!(tokens.contains(&"cats".to_string()));
    }

    #[test]
    fn test_stem_language_from_str() {
        assert_eq!(
            StemLanguage::parse_str("english"),
            Some(StemLanguage::English)
        );
        assert_eq!(StemLanguage::parse_str("en"), Some(StemLanguage::English));
        assert_eq!(
            StemLanguage::parse_str("ENGLISH"),
            Some(StemLanguage::English)
        );
        assert_eq!(
            StemLanguage::parse_str("french"),
            Some(StemLanguage::French)
        );
        assert_eq!(StemLanguage::parse_str("fr"), Some(StemLanguage::French));
        assert_eq!(
            StemLanguage::parse_str("german"),
            Some(StemLanguage::German)
        );
        assert_eq!(StemLanguage::parse_str("de"), Some(StemLanguage::German));
        assert_eq!(
            StemLanguage::parse_str("spanish"),
            Some(StemLanguage::Spanish)
        );
        assert_eq!(StemLanguage::parse_str("es"), Some(StemLanguage::Spanish));
        assert_eq!(StemLanguage::parse_str("none"), Some(StemLanguage::None));
        assert_eq!(StemLanguage::parse_str(""), Some(StemLanguage::None));
        assert_eq!(StemLanguage::parse_str("invalid"), None);
    }

    #[test]
    fn test_supported_languages() {
        let languages = StemLanguage::supported_languages();
        assert!(languages.contains(&"english"));
        assert!(languages.contains(&"french"));
        assert!(languages.contains(&"german"));
        assert!(languages.contains(&"spanish"));
        assert!(languages.contains(&"italian"));
        assert!(languages.contains(&"portuguese"));
        assert!(languages.contains(&"russian"));
        assert!(languages.contains(&"arabic"));
        assert_eq!(languages.len(), 18);
    }
}