// nodedb_fts/analyzer/language/stemmer.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Language-specific analyzer: Snowball stemming with per-language stop words.
4
5use rust_stemmers::{Algorithm, Stemmer};
6
7use crate::analyzer::pipeline::{TextAnalyzer, tokenize_with_stemmer};
8
9use super::stop_words;
10
/// Language-specific analyzer using Snowball stemming and per-language stop words.
pub struct LanguageAnalyzer {
    // Snowball algorithm selected at construction; used to build a stemmer per `analyze` call.
    algorithm: Algorithm,
    // ISO 639-1 code (e.g. "en"); keys the stop-word lookup in `analyze`.
    lang_code: String,
    // Lowercased form of the caller's `language` argument; reported by `name()`.
    lang_name: String,
}
17
18impl LanguageAnalyzer {
19    pub fn new(language: &str) -> Option<Self> {
20        let lower = language.to_lowercase();
21        let (algorithm, code) = match lower.as_str() {
22            "english" | "en" => (Algorithm::English, "en"),
23            "german" | "de" => (Algorithm::German, "de"),
24            "french" | "fr" => (Algorithm::French, "fr"),
25            "spanish" | "es" => (Algorithm::Spanish, "es"),
26            "italian" | "it" => (Algorithm::Italian, "it"),
27            "portuguese" | "pt" => (Algorithm::Portuguese, "pt"),
28            "dutch" | "nl" => (Algorithm::Dutch, "nl"),
29            "swedish" | "sv" => (Algorithm::Swedish, "sv"),
30            "norwegian" | "no" => (Algorithm::Norwegian, "no"),
31            "danish" | "da" => (Algorithm::Danish, "da"),
32            "finnish" | "fi" => (Algorithm::Finnish, "fi"),
33            "russian" | "ru" => (Algorithm::Russian, "ru"),
34            "turkish" | "tr" => (Algorithm::Turkish, "tr"),
35            "hungarian" | "hu" => (Algorithm::Hungarian, "hu"),
36            "romanian" | "ro" => (Algorithm::Romanian, "ro"),
37            "arabic" | "ar" => (Algorithm::Arabic, "ar"),
38            _ => return None,
39        };
40        Some(Self {
41            algorithm,
42            lang_code: code.to_string(),
43            lang_name: lower,
44        })
45    }
46
47    /// Language code (ISO 639-1).
48    pub fn lang_code(&self) -> &str {
49        &self.lang_code
50    }
51}
52
53impl TextAnalyzer for LanguageAnalyzer {
54    fn analyze(&self, text: &str) -> Vec<String> {
55        let stemmer = Stemmer::create(self.algorithm);
56        let stop_list = stop_words::stop_words(&self.lang_code);
57        tokenize_with_stemmer(text, &stemmer, &self.lang_code, stop_list)
58    }
59
60    fn name(&self) -> &str {
61        &self.lang_name
62    }
63}
64
/// No-stemmer analyzer for languages without Snowball support (Hindi, etc.).
///
/// Applies the full pipeline (normalize, split, stop words) but skips stemming.
pub struct NoStemAnalyzer {
    // ISO 639-1 code (e.g. "hi"); keys the stop-word lookup in `analyze`.
    lang_code: String,
    // Lowercased form of the caller's `language` argument; reported by `name()`.
    lang_name: String,
}
72
73impl NoStemAnalyzer {
74    pub fn new(language: &str) -> Option<Self> {
75        let lower = language.to_lowercase();
76        let code = match lower.as_str() {
77            "hindi" | "hi" => "hi",
78            "hebrew" | "he" => "he",
79            "thai" | "th" => "th",
80            "vietnamese" | "vi" => "vi",
81            "indonesian" | "id" => "id",
82            "chinese" | "zh" => "zh",
83            "japanese" | "ja" => "ja",
84            "korean" | "ko" => "ko",
85            "czech" | "cs" => "cs",
86            "polish" | "pl" => "pl",
87            "greek" | "el" => "el",
88            _ => return None,
89        };
90        Some(Self {
91            lang_code: code.to_string(),
92            lang_name: lower,
93        })
94    }
95}
96
impl TextAnalyzer for NoStemAnalyzer {
    /// Runs the shared tokenization pipeline with this language's stop-word
    /// list, using an English stemmer intended as a no-op stand-in.
    fn analyze(&self, text: &str) -> Vec<String> {
        let stop_list = stop_words::stop_words(&self.lang_code);
        // Use English stemmer as no-op: it won't affect non-English words meaningfully.
        // The stop word list does the language-specific work.
        // NOTE(review): this is only a true no-op for non-Latin scripts (zh, ja,
        // ko, th, he, hi). For the Latin-script languages accepted here (id, vi,
        // cs, pl) the English Snowball rules can still strip suffixes such as
        // "-s"/"-es" from tokens — confirm whether the pipeline can offer a
        // pass-through (identity) stemmer instead.
        let stemmer = Stemmer::create(Algorithm::English);
        tokenize_with_stemmer(text, &stemmer, &self.lang_code, stop_list)
    }

    /// Lowercased language name this analyzer was constructed with.
    fn name(&self) -> &str {
        &self.lang_name
    }
}
110
#[cfg(test)]
mod tests {
    use super::*;

    /// True when none of `tokens` equals any entry in `banned`.
    fn none_present(tokens: &[String], banned: &[&str]) -> bool {
        tokens.iter().all(|t| !banned.contains(&t.as_str()))
    }

    #[test]
    fn german_uses_german_stop_words() {
        let tokens = LanguageAnalyzer::new("german")
            .unwrap()
            .analyze("Die Datenbanken sind schnell");
        // "die" and "sind" are German stop words — should be removed.
        assert!(none_present(&tokens, &["die", "sind"]));
        assert!(!tokens.is_empty());
    }

    #[test]
    fn german_does_not_use_english_stop_words() {
        // "the" is an English stop word but NOT a German one — should pass through.
        let tokens = LanguageAnalyzer::new("german")
            .unwrap()
            .analyze("the Datenbank");
        assert!(tokens.contains(&"the".to_string()));
    }

    #[test]
    fn french_stop_words() {
        let tokens = LanguageAnalyzer::new("french")
            .unwrap()
            .analyze("le chat est sur la table");
        // "le", "est", "sur", "la" are French stop words.
        assert!(none_present(&tokens, &["le", "la", "sur"]));
    }

    #[test]
    fn arabic_analyzer() {
        let tokens = LanguageAnalyzer::new("arabic")
            .unwrap()
            .analyze("في المدينة الكبيرة");
        // "في" is an Arabic stop word.
        assert!(none_present(&tokens, &["في"]));
    }

    #[test]
    fn unknown_language_returns_none() {
        assert!(LanguageAnalyzer::new("klingon").is_none());
    }

    #[test]
    fn no_stem_hindi() {
        let tokens = NoStemAnalyzer::new("hindi")
            .unwrap()
            .analyze("यह एक परीक्षा है");
        // "यह" and "है" are Hindi stop words.
        assert!(none_present(&tokens, &["यह", "है"]));
    }
}