rust_stemmers/
lib.rs

//! This library provides Rust implementations of several stemming algorithms
//! originally written in the [Snowball language](https://snowballstem.org/).
3//!
4//!
5//! All algorithms expect the input to already be lowercased.
6//!
7//! # Usage
8//! ```toml
9//! [dependencies]
10//! rust-stemmers = "^1.0"
11//! ```
12//!
13//! ```rust
14//! extern crate rust_stemmers;
15//!
16//! use rust_stemmers::{Algorithm, Stemmer};
17//!
18//! fn main() {
19//!    let en_stemmer = Stemmer::create(Algorithm::English);
20//!    assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless");
21//! }
22//! ```
23extern crate serde;
24#[macro_use]
25extern crate serde_derive;
26
27use std::borrow::Cow;
28
29mod snowball;
30
31use snowball::SnowballEnv;
32use snowball::algorithms;
33
34/// Enum of all supported algorithms.
35/// Check the [Snowball-Website](https://snowballstem.org/) for details.
36#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
37pub enum Algorithm {
38    Arabic,
39    Danish,
40    Dutch,
41    English,
42    Finnish,
43    French,
44    German,
45    Greek,
46    Hungarian,
47    Italian,
48    Norwegian,
49    Portuguese,
50    Romanian,
51    Russian,
52    Spanish,
53    Swedish,
54    Tamil,
55    Turkish
56}
57
58/// Wrapps a usable interface around the actual stemmer implementation
59pub struct Stemmer {
60    stemmer: fn(&mut SnowballEnv) -> bool,
61}
62
63impl Stemmer {
64    /// Create a new stemmer from an algorithm
65    pub fn create(lang: Algorithm) -> Self {
66        match lang {
67            Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem },
68            Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem },
69            Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem },
70            Algorithm::English => Stemmer { stemmer: algorithms::english::stem },
71            Algorithm::Finnish => Stemmer { stemmer: algorithms::finnish::stem },
72            Algorithm::French => Stemmer { stemmer: algorithms::french::stem },
73            Algorithm::German => Stemmer { stemmer: algorithms::german::stem },
74            Algorithm::Greek => Stemmer { stemmer: algorithms::greek::stem },
75            Algorithm::Hungarian => Stemmer { stemmer: algorithms::hungarian::stem },
76            Algorithm::Italian => Stemmer { stemmer: algorithms::italian::stem },
77            Algorithm::Norwegian => Stemmer { stemmer: algorithms::norwegian::stem },
78            Algorithm::Portuguese => Stemmer { stemmer: algorithms::portuguese::stem },
79            Algorithm::Romanian => Stemmer { stemmer: algorithms::romanian::stem },
80            Algorithm::Russian => Stemmer { stemmer: algorithms::russian::stem },
81            Algorithm::Spanish => Stemmer { stemmer: algorithms::spanish::stem },
82            Algorithm::Swedish => Stemmer { stemmer: algorithms::swedish::stem },
83            Algorithm::Tamil => Stemmer { stemmer: algorithms::tamil::stem },
84            Algorithm::Turkish => Stemmer { stemmer: algorithms::turkish::stem },
85        }
86    }
87
88    /// Stem a single word
89    /// Please note, that the input is expected to be all lowercase (if that is applicable).
90    pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
91        let mut env = SnowballEnv::create(input);
92        (self.stemmer)(&mut env);
93        env.get_current()
94    }
95}
96
97
98
#[cfg(test)]
mod tests {
    use super::{Algorithm, Stemmer};
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    /// Reads every line of the fixture at `path`.
    ///
    /// Panics with the path in the message if the file cannot be opened or a
    /// line cannot be read, so a missing fixture is easy to identify.
    fn read_fixture(path: &str) -> Vec<String> {
        let file = File::open(path)
            .unwrap_or_else(|err| panic!("cannot open fixture {}: {}", path, err));
        BufReader::new(file)
            .lines()
            .map(|line| {
                line.unwrap_or_else(|err| panic!("cannot read fixture {}: {}", path, err))
            })
            .collect()
    }

    /// Stems every word of `test_data/voc_<code>.txt` with `algorithm` and
    /// compares it with the same line of `test_data/res_<code>.txt`.
    ///
    /// Panics on the first mismatch, naming the algorithm, the vocabulary
    /// file, the line number and the offending input word.
    fn check_corpus(code: &str, algorithm: Algorithm) {
        let vocab_path = format!("test_data/voc_{}.txt", code);
        let result_path = format!("test_data/res_{}.txt", code);
        let vocab = read_fixture(&vocab_path);
        let expected = read_fixture(&result_path);

        // `zip` stops at the shorter input, so without this check a truncated
        // fixture would leave words unverified while the test still passes.
        assert_eq!(
            vocab.len(),
            expected.len(),
            "{} and {} have different line counts",
            vocab_path,
            result_path
        );

        // One stemmer serves the whole corpus; it carries no per-word state.
        let stemmer = Stemmer::create(algorithm);
        for (index, (word, stem)) in vocab.iter().zip(expected.iter()).enumerate() {
            assert_eq!(
                stemmer.stem(word),
                stem.as_str(),
                "{:?} stemmer, {} line {}: wrong stem for {:?}",
                algorithm,
                vocab_path,
                index + 1,
                word
            );
        }
    }

    #[test]
    fn german_test() {
        check_corpus("ger", Algorithm::German);
    }

    #[test]
    fn english_test() {
        check_corpus("en", Algorithm::English);
    }

    #[test]
    fn french_test() {
        check_corpus("fr", Algorithm::French);
    }

    #[test]
    fn spanish_test() {
        check_corpus("es", Algorithm::Spanish);
    }

    #[test]
    fn portuguese_test() {
        check_corpus("pt", Algorithm::Portuguese);
    }

    #[test]
    fn italian_test() {
        check_corpus("it", Algorithm::Italian);
    }

    #[test]
    fn romanian_test() {
        check_corpus("ro", Algorithm::Romanian);
    }

    #[test]
    fn russian_test() {
        check_corpus("ru", Algorithm::Russian);
    }

    #[test]
    fn arabic_test() {
        check_corpus("ar", Algorithm::Arabic);
    }

    #[test]
    fn finnish_test() {
        check_corpus("fi", Algorithm::Finnish);
    }

    #[test]
    fn greek_test() {
        check_corpus("el", Algorithm::Greek);
    }

    #[test]
    fn norwegian_test() {
        check_corpus("no", Algorithm::Norwegian);
    }
}