Skip to main content

qdrant_rust_stemmers/
lib.rs

1//! This library provides rust implementations for some stemmer algorithms
2//! written in the [snowball language](https://snowballstem.org/).
3//!
4//!
5//! All algorithms expect the input to already be lowercased.
6//!
7//! # Usage
8//! ```toml
9//! [dependencies]
10//! rust-stemmers = "^1.0"
11//! ```
12//!
13//! ```rust
14//! extern crate qdrant_rust_stemmers;
15//!
16//! use qdrant_rust_stemmers::{Algorithm, Stemmer};
17//!
18//! fn main() {
19//!    let en_stemmer = Stemmer::create(Algorithm::English);
20//!    assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless");
21//! }
22//! ```
23extern crate serde;
24#[macro_use]
25extern crate serde_derive;
26
27use std::borrow::Cow;
28
29mod snowball;
30
31use snowball::algorithms;
32use snowball::SnowballEnv;
33
34/// Enum of all supported algorithms.
35/// Check the [Snowball-Website](https://snowballstem.org/) for details.
36#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
37pub enum Algorithm {
38    Arabic,
39    Armenian,
40    Danish,
41    Dutch,
42    English,
43    Finnish,
44    French,
45    German,
46    Greek,
47    Hungarian,
48    Italian,
49    Norwegian,
50    Portuguese,
51    Romanian,
52    Russian,
53    Spanish,
54    Swedish,
55    Tamil,
56    Turkish,
57}
58
59/// Wrapps a usable interface around the actual stemmer implementation
60pub struct Stemmer {
61    stemmer: fn(&mut SnowballEnv) -> bool,
62}
63
64impl Stemmer {
65    /// Create a new stemmer from an algorithm
66    pub fn create(lang: Algorithm) -> Self {
67        match lang {
68            Algorithm::Arabic => Stemmer {
69                stemmer: algorithms::arabic::stem,
70            },
71            Algorithm::Armenian => Stemmer {
72                stemmer: algorithms::armenian::stem,
73            },
74            Algorithm::Danish => Stemmer {
75                stemmer: algorithms::danish::stem,
76            },
77            Algorithm::Dutch => Stemmer {
78                stemmer: algorithms::dutch::stem,
79            },
80            Algorithm::English => Stemmer {
81                stemmer: algorithms::english::stem,
82            },
83            Algorithm::Finnish => Stemmer {
84                stemmer: algorithms::finnish::stem,
85            },
86            Algorithm::French => Stemmer {
87                stemmer: algorithms::french::stem,
88            },
89            Algorithm::German => Stemmer {
90                stemmer: algorithms::german::stem,
91            },
92            Algorithm::Greek => Stemmer {
93                stemmer: algorithms::greek::stem,
94            },
95            Algorithm::Hungarian => Stemmer {
96                stemmer: algorithms::hungarian::stem,
97            },
98            Algorithm::Italian => Stemmer {
99                stemmer: algorithms::italian::stem,
100            },
101            Algorithm::Norwegian => Stemmer {
102                stemmer: algorithms::norwegian::stem,
103            },
104            Algorithm::Portuguese => Stemmer {
105                stemmer: algorithms::portuguese::stem,
106            },
107            Algorithm::Romanian => Stemmer {
108                stemmer: algorithms::romanian::stem,
109            },
110            Algorithm::Russian => Stemmer {
111                stemmer: algorithms::russian::stem,
112            },
113            Algorithm::Spanish => Stemmer {
114                stemmer: algorithms::spanish::stem,
115            },
116            Algorithm::Swedish => Stemmer {
117                stemmer: algorithms::swedish::stem,
118            },
119            Algorithm::Tamil => Stemmer {
120                stemmer: algorithms::tamil::stem,
121            },
122            Algorithm::Turkish => Stemmer {
123                stemmer: algorithms::turkish::stem,
124            },
125        }
126    }
127
128    /// Stem a single word
129    /// Please note, that the input is expected to be all lowercase (if that is applicable).
130    pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
131        let mut env = SnowballEnv::create(input);
132        (self.stemmer)(&mut env);
133        env.get_current()
134    }
135
136    /// Stem a single word
137    /// Please note, that the input is expected to be all lowercase (if that is applicable).
138    pub fn stem_cow<'a>(&self, input: Cow<'a, str>) -> Cow<'a, str> {
139        let mut env = SnowballEnv::create_cow(input);
140        (self.stemmer)(&mut env);
141        env.get_current()
142    }
143}
144
#[cfg(test)]
mod tests {
    use super::{Algorithm, Stemmer};
    use std::fs::File;
    use std::io::{self, BufRead, BufReader};

    /// Open `path` and return an iterator over its lines.
    ///
    /// Panics with the offending path if the file cannot be opened, so a
    /// missing fixture is easy to identify in the test output.
    fn open_lines(path: &str) -> io::Lines<BufReader<File>> {
        let file = File::open(path).unwrap_or_else(|e| panic!("cannot open {}: {}", path, e));
        BufReader::new(file).lines()
    }

    /// Check one language's fixture pair.
    ///
    /// Reads `test_data/voc_<lang>.txt` (one input word per line) and
    /// `test_data/res_<lang>.txt` (the expected stem on the same line) and
    /// asserts that `algorithm` maps every word to its expected stem.
    ///
    /// Unlike a plain `zip`, this also fails when the two files differ in
    /// length, so a truncated or empty fixture cannot make the test pass
    /// vacuously.
    fn assert_vocabulary(lang: &str, algorithm: Algorithm) {
        let voc_path = format!("test_data/voc_{}.txt", lang);
        let res_path = format!("test_data/res_{}.txt", lang);

        let mut words = open_lines(&voc_path);
        let mut stems = open_lines(&res_path);
        let stemmer = Stemmer::create(algorithm);

        let mut line_no = 0usize;
        loop {
            line_no += 1;
            match (words.next(), stems.next()) {
                (Some(word), Some(expected)) => {
                    let word = word
                        .unwrap_or_else(|e| panic!("{}:{}: read error: {}", voc_path, line_no, e));
                    let expected = expected
                        .unwrap_or_else(|e| panic!("{}:{}: read error: {}", res_path, line_no, e));
                    assert_eq!(
                        stemmer.stem(word.as_str()),
                        expected.as_str(),
                        "{}:{}: wrong stem for {:?}",
                        voc_path,
                        line_no,
                        word
                    );
                }
                // Both files ended on the same line: every pair was checked.
                (None, None) => break,
                // Exactly one file ran out first: the fixtures are out of sync.
                _ => panic!(
                    "{} and {} differ in length (first unmatched line: {})",
                    voc_path, res_path, line_no
                ),
            }
        }
    }

    #[test]
    fn german_test() {
        assert_vocabulary("ger", Algorithm::German);
    }

    #[test]
    fn english_test() {
        assert_vocabulary("en", Algorithm::English);
    }

    #[test]
    fn french_test() {
        assert_vocabulary("fr", Algorithm::French);
    }

    #[test]
    fn spanish_test() {
        assert_vocabulary("es", Algorithm::Spanish);
    }

    #[test]
    fn portuguese_test() {
        assert_vocabulary("pt", Algorithm::Portuguese);
    }

    #[test]
    fn italian_test() {
        assert_vocabulary("it", Algorithm::Italian);
    }

    #[test]
    fn romanian_test() {
        assert_vocabulary("ro", Algorithm::Romanian);
    }

    #[test]
    fn russian_test() {
        assert_vocabulary("ru", Algorithm::Russian);
    }

    #[test]
    fn arabic_test() {
        assert_vocabulary("ar", Algorithm::Arabic);
    }

    #[test]
    fn finnish_test() {
        assert_vocabulary("fi", Algorithm::Finnish);
    }

    #[test]
    fn greek_test() {
        assert_vocabulary("el", Algorithm::Greek);
    }

    #[test]
    fn norwegian_test() {
        assert_vocabulary("no", Algorithm::Norwegian);
    }
}