custom_rust_stemmers/
lib.rs

//! This library provides Rust implementations of several stemming algorithms
//! that were originally written in the [Snowball language](https://snowballstem.org/).
3//!
4//!
5//! All algorithms expect the input to already be lowercased.
6//!
7//! # Usage
8//! ```toml
9//! [dependencies]
10//! rust-stemmers = "0.1"
11//! ```
12//!
13//! ```rust
14//! extern crate rust_stemmers;
15//!
16//! use rust_stemmers::{Algorithm, Stemmer};
17//!
18//! fn main() {
19//!    let en_stemmer = Stemmer::create(Algorithm::English);
20//!    assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless");
21//! }
22//! ```
23
24
25use std::borrow::Cow;
26
27mod snowball;
28
29use snowball::SnowballEnv;
30use snowball::algorithms;
31
/// Enum of all supported stemming algorithms.
///
/// Each variant names the language whose stemmer is selected when the value is
/// passed to `Stemmer::create`.
/// Check the [Snowball-Website](https://snowballstem.org/) for details.
///
/// The enum is a plain tag without payload, so it derives the common value
/// traits: it can be printed, copied, compared and used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Algorithm {
    Arabic,
    English,
    French,
    German,
    Italian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
}
45
46/// Wrapps a usable interface around the actual stemmer implementation
47pub struct Stemmer {
48    stemmer: Box<Fn(&mut SnowballEnv) -> bool>,
49}
50
51impl Stemmer {
52    /// Create a new stemmer from an algorithm
53    pub fn create(lang: Algorithm) -> Self {
54        match lang {
55            Algorithm::Arabic => Stemmer { stemmer: Box::new(algorithms::arabic::_stem) },
56            Algorithm::English => Stemmer { stemmer: Box::new(algorithms::english::_stem) },
57            Algorithm::French => Stemmer { stemmer: Box::new(algorithms::french::_stem) },
58            Algorithm::German => Stemmer { stemmer: Box::new(algorithms::german::_stem) },
59            Algorithm::Italian => Stemmer { stemmer: Box::new(algorithms::italian::_stem) },
60            Algorithm::Portuguese => Stemmer { stemmer: Box::new(algorithms::portuguese::_stem) },
61            Algorithm::Romanian => Stemmer { stemmer: Box::new(algorithms::romanian::_stem) },
62            Algorithm::Russian => Stemmer { stemmer: Box::new(algorithms::russian::_stem) },
63            Algorithm::Spanish => Stemmer { stemmer: Box::new(algorithms::spanish::_stem) },
64        }
65    }
66
67    /// Stem a single word
68    /// Please note, that the input is expected to be all lowercase (if that is applicable).
69    pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
70        let mut env = SnowballEnv::create(input);
71        (self.stemmer)(&mut env);
72        env.get_current()
73    }
74}
75
76
77
#[cfg(test)]
mod tests {
    use super::{Stemmer, Algorithm};

    use std::fs::File;
    use std::io::{BufRead, BufReader};

    /// Open a test-data file for buffered reading, naming the path on failure.
    fn open_corpus(path: &str) -> BufReader<File> {
        let file = File::open(path)
            .unwrap_or_else(|err| panic!("cannot open {}: {}", path, err));
        BufReader::new(file)
    }

    /// Stem every line of `vocab_path` with `algorithm` and compare it with the
    /// line at the same position in `result_path`.
    ///
    /// The two files are walked in lockstep with `zip`, so the comparison stops
    /// at the end of the shorter file.
    fn assert_corpus(algorithm: Algorithm, vocab_path: &str, result_path: &str) {
        // One stemmer serves the whole corpus; there is no need to rebuild it per word.
        let stemmer = Stemmer::create(algorithm);
        let vocab = open_corpus(vocab_path);
        let expected = open_corpus(result_path);

        for (index, (word, stem)) in vocab.lines().zip(expected.lines()).enumerate() {
            let line_no = index + 1;
            let word = word
                .unwrap_or_else(|err| panic!("{}:{}: {}", vocab_path, line_no, err));
            let stem = stem
                .unwrap_or_else(|err| panic!("{}:{}: {}", result_path, line_no, err));

            assert_eq!(stemmer.stem(word.as_str()),
                       stem.as_str(),
                       "{}:{}: unexpected stem for {:?}",
                       vocab_path,
                       line_no,
                       word);
        }
    }

    #[test]
    fn german_test() {
        assert_corpus(Algorithm::German,
                      "test_data/voc_ger.txt",
                      "test_data/res_ger.txt");
    }

    #[test]
    fn english_test() {
        assert_corpus(Algorithm::English,
                      "test_data/voc_en.txt",
                      "test_data/res_en.txt");
    }

    #[test]
    fn french_test() {
        assert_corpus(Algorithm::French,
                      "test_data/voc_fr.txt",
                      "test_data/res_fr.txt");
    }

    #[test]
    fn spanish_test() {
        assert_corpus(Algorithm::Spanish,
                      "test_data/voc_es.txt",
                      "test_data/res_es.txt");
    }

    #[test]
    fn portuguese_test() {
        assert_corpus(Algorithm::Portuguese,
                      "test_data/voc_pt.txt",
                      "test_data/res_pt.txt");
    }

    #[test]
    fn italian_test() {
        assert_corpus(Algorithm::Italian,
                      "test_data/voc_it.txt",
                      "test_data/res_it.txt");
    }

    #[test]
    fn romanian_test() {
        assert_corpus(Algorithm::Romanian,
                      "test_data/voc_ro.txt",
                      "test_data/res_ro.txt");
    }

    #[test]
    fn russian_test() {
        assert_corpus(Algorithm::Russian,
                      "test_data/voc_ru.txt",
                      "test_data/res_ru.txt");
    }

    #[test]
    fn arabic_test() {
        assert_corpus(Algorithm::Arabic,
                      "test_data/voc_ar.txt",
                      "test_data/res_ar.txt");
    }

}