// nodedb_fts/analyzer/language/stemmer.rs

use rust_stemmers::{Algorithm, Stemmer};

use crate::analyzer::pipeline::{TextAnalyzer, tokenize_with_stemmer};

use super::stop_words;
/// Text analyzer for languages that have a Snowball stemming algorithm.
///
/// `analyze` tokenizes input, drops language-specific stop words, and stems
/// each surviving token (see the `TextAnalyzer` impl below).
pub struct LanguageAnalyzer {
    // Snowball algorithm selected in `new` from the requested language.
    algorithm: Algorithm,
    // Two-letter code (e.g. "en") used for stop-word and tokenizer lookup.
    lang_code: String,
    // Lowercased string the caller passed to `new` — may be either the full
    // language name ("german") or the short code ("de").
    lang_name: String,
}
17
18impl LanguageAnalyzer {
19 pub fn new(language: &str) -> Option<Self> {
20 let lower = language.to_lowercase();
21 let (algorithm, code) = match lower.as_str() {
22 "english" | "en" => (Algorithm::English, "en"),
23 "german" | "de" => (Algorithm::German, "de"),
24 "french" | "fr" => (Algorithm::French, "fr"),
25 "spanish" | "es" => (Algorithm::Spanish, "es"),
26 "italian" | "it" => (Algorithm::Italian, "it"),
27 "portuguese" | "pt" => (Algorithm::Portuguese, "pt"),
28 "dutch" | "nl" => (Algorithm::Dutch, "nl"),
29 "swedish" | "sv" => (Algorithm::Swedish, "sv"),
30 "norwegian" | "no" => (Algorithm::Norwegian, "no"),
31 "danish" | "da" => (Algorithm::Danish, "da"),
32 "finnish" | "fi" => (Algorithm::Finnish, "fi"),
33 "russian" | "ru" => (Algorithm::Russian, "ru"),
34 "turkish" | "tr" => (Algorithm::Turkish, "tr"),
35 "hungarian" | "hu" => (Algorithm::Hungarian, "hu"),
36 "romanian" | "ro" => (Algorithm::Romanian, "ro"),
37 "arabic" | "ar" => (Algorithm::Arabic, "ar"),
38 _ => return None,
39 };
40 Some(Self {
41 algorithm,
42 lang_code: code.to_string(),
43 lang_name: lower,
44 })
45 }
46
47 pub fn lang_code(&self) -> &str {
49 &self.lang_code
50 }
51}
52
53impl TextAnalyzer for LanguageAnalyzer {
54 fn analyze(&self, text: &str) -> Vec<String> {
55 let stemmer = Stemmer::create(self.algorithm);
56 let stop_list = stop_words::stop_words(&self.lang_code);
57 tokenize_with_stemmer(text, &stemmer, &self.lang_code, stop_list)
58 }
59
60 fn name(&self) -> &str {
61 &self.lang_name
62 }
63}
64
/// Text analyzer for languages without a dedicated Snowball stemmer
/// (e.g. Hindi, Thai, Chinese). Performs tokenization and stop-word
/// removal only — see the `TextAnalyzer` impl below.
pub struct NoStemAnalyzer {
    // Two-letter code (e.g. "hi") used for stop-word and tokenizer lookup.
    lang_code: String,
    // Lowercased string the caller passed to `new` (full name or code).
    lang_name: String,
}
72
73impl NoStemAnalyzer {
74 pub fn new(language: &str) -> Option<Self> {
75 let lower = language.to_lowercase();
76 let code = match lower.as_str() {
77 "hindi" | "hi" => "hi",
78 "hebrew" | "he" => "he",
79 "thai" | "th" => "th",
80 "vietnamese" | "vi" => "vi",
81 "indonesian" | "id" => "id",
82 "chinese" | "zh" => "zh",
83 "japanese" | "ja" => "ja",
84 "korean" | "ko" => "ko",
85 "czech" | "cs" => "cs",
86 "polish" | "pl" => "pl",
87 "greek" | "el" => "el",
88 _ => return None,
89 };
90 Some(Self {
91 lang_code: code.to_string(),
92 lang_name: lower,
93 })
94 }
95}
96
impl TextAnalyzer for NoStemAnalyzer {
    /// Tokenizes `text` and removes this language's stop words.
    fn analyze(&self, text: &str) -> Vec<String> {
        let stop_list = stop_words::stop_words(&self.lang_code);
        // NOTE(review): despite this type's "no stem" name, an English
        // Snowball stemmer is handed to the shared pipeline here. Presumably
        // `tokenize_with_stemmer` skips stemming for these lang codes, or the
        // English stemmer is effectively a no-op on these scripts — confirm
        // against the pipeline implementation before relying on either.
        let stemmer = Stemmer::create(Algorithm::English);
        tokenize_with_stemmer(text, &stemmer, &self.lang_code, stop_list)
    }

    /// The lowercased language string the analyzer was constructed with.
    fn name(&self) -> &str {
        &self.lang_name
    }
}
110
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn german_uses_german_stop_words() {
        let german = LanguageAnalyzer::new("german").unwrap();
        let tokens = german.analyze("Die Datenbanken sind schnell");
        // German function words must be filtered out, but content words kept.
        let kept_stop_word = tokens.iter().any(|t| t == "die" || t == "sind");
        assert!(!kept_stop_word);
        assert!(!tokens.is_empty());
    }

    #[test]
    fn german_does_not_use_english_stop_words() {
        let german = LanguageAnalyzer::new("german").unwrap();
        let tokens = german.analyze("the Datenbank");
        // "the" is only an English stop word; the German list must keep it.
        assert!(tokens.iter().any(|t| t == "the"));
    }

    #[test]
    fn french_stop_words() {
        let french = LanguageAnalyzer::new("french").unwrap();
        let tokens = french.analyze("le chat est sur la table");
        for stop in ["le", "la", "sur"] {
            assert!(!tokens.iter().any(|t| t == stop));
        }
    }

    #[test]
    fn arabic_analyzer() {
        let arabic = LanguageAnalyzer::new("arabic").unwrap();
        let tokens = arabic.analyze("في المدينة الكبيرة");
        assert!(!tokens.iter().any(|t| t == "في"));
    }

    #[test]
    fn unknown_language_returns_none() {
        assert!(LanguageAnalyzer::new("klingon").is_none());
    }

    #[test]
    fn no_stem_hindi() {
        let hindi = NoStemAnalyzer::new("hindi").unwrap();
        let tokens = hindi.analyze("यह एक परीक्षा है");
        let kept_stop_word = tokens.iter().any(|t| t == "यह" || t == "है");
        assert!(!kept_stop_word);
    }
}