1use std::str;
4use crate::error::RnltkError;
5
/// Mutable state for stemming a single lowercase ASCII word with the
/// Porter algorithm.
struct Stemmer {
    bytes: Vec<u8>,       // lowercase ASCII word buffer, rewritten in place
    bytes_length: usize,  // current logical length of the stem within `bytes`
    offset: usize,        // end of the region scanned by `measure`/`has_vowel`
}
11
12impl Stemmer {
13 fn new(word: &str) -> Result<Stemmer, RnltkError> {
14 if !word.is_ascii() {
15 Err(RnltkError::StemNonAscii)
16 } else {
17 let bytes = word.to_ascii_lowercase().into_bytes();
18 let bytes_length = bytes.len();
19 Ok(Stemmer {
20 bytes,
21 bytes_length,
22 offset: 0
23 })
24 }
25 }
26
27 #[inline]
29 fn is_consonant(&self, index: usize) -> bool {
30 match self.bytes[index] {
31 b'a' | b'e' | b'i' | b'o' | b'u' => false,
32 b'y' => {
33 if index == 0 {
34 true
35 } else {
36 !self.is_consonant(index - 1)
37 }
38 }
39 _ => true,
40 }
41 }
42
43 fn measure(&self) -> usize {
55 let mut n = 0;
56 let mut index = 0;
57 let offset = self.offset;
58 loop {
59 if index >= offset {
60 return n;
61 }
62 if !self.is_consonant(index) {
63 break;
64 }
65 index += 1;
66 }
67 index += 1;
68 loop {
69 loop {
70 if index >= offset {
71 return n;
72 }
73 if self.is_consonant(index) {
74 break;
75 }
76 index += 1;
77 }
78 index += 1;
79 n += 1;
80 loop {
81 if index >= offset {
82 return n;
83 }
84 if !self.is_consonant(index) {
85 break;
86 }
87 index += 1;
88 }
89 index += 1;
90 }
91 }
92
93 fn has_vowel(&self) -> bool {
95 for index in 0..self.offset {
96 if !self.is_consonant(index) {
97 return true;
98 }
99 }
100 false
101 }
102
103 #[inline]
105 fn double_consonant(&self, index: usize) -> bool {
106 if index < 1 || self.bytes[index] != self.bytes[index - 1] {
107 false
108 } else {
109 self.is_consonant(index)
110 }
111 }
112
113 fn cvc(&self, index: usize) -> bool {
122 if index < 2 || !self.is_consonant(index) || self.is_consonant(index - 1) || !self.is_consonant(index - 2) {
123 false
124 } else {
125 !matches!(self.bytes[index], b'w' | b'x' | b'y')
126 }
127 }
128
129 fn ends(&mut self, _s: &str) -> bool {
131 let s = _s.as_bytes();
132 let len = s.len();
133 if len > self.bytes_length {
134 false
135 } else {
136 &self.bytes[self.bytes_length - len..self.bytes_length] == s
137 }
138 }
139
140 fn update_offset(&mut self, _s: &str) {
141 let s = _s.as_bytes();
142 let len = s.len();
143 self.offset = self.bytes_length - len;
144 }
145
146 fn set_to(&mut self, s: &str) {
149 let s = s.as_bytes();
150 let len = s.len();
151 for (index, byte) in s.iter().enumerate() {
152 self.bytes[self.offset + index] = *byte;
153 }
154 self.bytes_length = self.offset + len;
155 }
156
157 #[inline]
159 fn replace(&mut self, s: &str) {
160 if self.measure() > 0 {
161 self.set_to(s);
162 }
163 }
164
    /// Steps 1a and 1b of the Porter algorithm.
    ///
    /// 1a strips plural endings: "sses" -> "ss", "ies" -> "i", and a bare
    /// final "s" (unless preceded by another "s").
    /// 1b strips "eed"/"ed"/"ing" and then repairs the stem so later steps
    /// see a canonical ending. The branch order is significant throughout.
    fn step1ab(&mut self) {
        if self.bytes[self.bytes_length - 1] == b's' {
            if self.ends("sses") {
                self.update_offset("sses");
                // "sses" -> "ss": drop the last two bytes.
                self.bytes_length -= 2;
            } else if self.ends("ies") {
                self.update_offset("ies");
                // "ies" -> "i" ("ponies" -> "poni").
                self.set_to("i");
            } else if self.bytes[self.bytes_length - 2] != b's' {
                // Plain plural "s", but keep a double "ss" intact.
                self.bytes_length -= 1;
            }
        }
        if self.ends("eed") {
            self.update_offset("eed");
            // "eed" -> "ee" only when the stem before it has measure > 0
            // ("agreed" -> "agree", but "feed" is left alone).
            if self.measure() > 0 {
                self.bytes_length -= 1
            }
        } else if self.ends("ed") || self.ends("ing") {
            if self.ends("ed") {
                self.update_offset("ed");
            } else if self.ends("ing") {
                self.update_offset("ing");
            }
            // Strip only when the remainder contains a vowel, then tidy up.
            if self.has_vowel() {
                self.bytes_length = self.offset;
                if self.ends("at") {
                    // "conflat(ed)" -> "conflate"
                    self.update_offset("at");
                    self.set_to("ate");
                } else if self.ends("bl") {
                    // "troubl(ing)" -> "trouble"
                    self.update_offset("bl");
                    self.set_to("ble");
                } else if self.ends("iz") {
                    // "siz(ed)" -> "size"
                    self.update_offset("iz");
                    self.set_to("ize");
                } else if self.double_consonant(self.bytes_length - 1) {
                    // Undouble a trailing consonant ("hopp" -> "hop"),
                    // except after l, s, z ("fall", "hiss", "fizz").
                    self.bytes_length -= 1;
                    match self.bytes[self.bytes_length - 1] {
                        b'l' | b's' | b'z' => self.bytes_length += 1,
                        _ => (),
                    }
                } else if self.measure() == 1 && self.cvc(self.bytes_length - 1) {
                    // Short CVC stems regain an "e" ("fil(ing)" -> "file").
                    self.set_to("e");
                }
            }
        }
    }
232
233 fn step1c(&mut self) {
235 if self.ends("y") {
236 self.update_offset("y");
237 if self.has_vowel() {
238 self.bytes[self.bytes_length - 1] = b'i';
239 }
240 }
241 }
242
243 fn step2(&mut self) {
247 match self.bytes[self.bytes_length - 2] {
248 b'a' => {
249 if self.ends("ational") {
250 self.update_offset("ational");
251 self.replace("ate");
252 } else if self.ends("tional") {
253 self.update_offset("tional");
254 self.replace("tion");
255 }
256 }
257 b'c' => {
258 if self.ends("enci") {
259 self.update_offset("enci");
260 self.replace("ence");
261 } else if self.ends("anci") {
262 self.update_offset("anci");
263 self.replace("ance");
264 }
265 }
266 b'e' => {
267 if self.ends("izer") {
268 self.update_offset("izer");
269 self.replace("ize");
270 }
271 }
272 b'l' => {
273 if self.ends("bli") {
274 self.update_offset("bli");
275 self.replace("ble");
276 } else if self.ends("alli") {
282 self.update_offset("alli");
283 self.replace("al");
284 } else if self.ends("entli") {
285 self.update_offset("entli");
286 self.replace("ent");
287 } else if self.ends("eli") {
288 self.update_offset("eli");
289 self.replace("e");
290 } else if self.ends("ousli") {
291 self.update_offset("ousli");
292 self.replace("ous");
293 }
294 }
295 b'o' => {
296 if self.ends("ization") {
297 self.update_offset("ization");
298 self.replace("ize");
299 } else if self.ends("ation") {
300 self.update_offset("ation");
301 self.replace("ate");
302 } else if self.ends("ator") {
303 self.update_offset("ator");
304 self.replace("ate");
305 }
306 }
307 b's' => {
308 if self.ends("alism") {
309 self.update_offset("alism");
310 self.replace("al");
311 } else if self.ends("iveness") {
312 self.update_offset("iveness");
313 self.replace("ive");
314 } else if self.ends("fulness") {
315 self.update_offset("fulness");
316 self.replace("ful");
317 } else if self.ends("ousness") {
318 self.update_offset("ousness");
319 self.replace("ous");
320 }
321 }
322 b't' => {
323 if self.ends("aliti") {
324 self.update_offset("aliti");
325 self.replace("al");
326 } else if self.ends("iviti") {
327 self.update_offset("iviti");
328 self.replace("ive");
329 } else if self.ends("biliti") {
330 self.update_offset("biliti");
331 self.replace("ble");
332 }
333 }
334 b'g' => {
335 if self.ends("logi") {
336 self.update_offset("logi");
337 self.replace("log");
338 }
339 } _ => (),
342 }
343 }
344
345 fn step3(&mut self) {
347 match self.bytes[self.bytes_length - 1] {
348 b'e' => {
349 if self.ends("icate") {
350 self.update_offset("icate");
351 self.replace("ic");
352 } else if self.ends("ative") {
353 self.update_offset("ative");
354 self.replace("");
355 } else if self.ends("alize") {
356 self.update_offset("alize");
357 self.replace("al");
358 }
359 }
360 b'i' => {
361 if self.ends("iciti") {
362 self.update_offset("iciti");
363 self.replace("ic");
364 }
365 }
366 b'l' => {
367 if self.ends("ical") {
368 self.update_offset("ical");
369 self.replace("ic");
370 } else if self.ends("ful") {
371 self.update_offset("ful");
372 self.replace("");
373 }
374 }
375 b's' => {
376 if self.ends("ness") {
377 self.update_offset("ness");
378 self.replace("");
379 }
380 }
381 _ => (),
382 }
383 }
384
385 fn step4(&mut self) {
387 let mut byte_was_matched = false;
388 match self.bytes[self.bytes_length - 2] {
389 b'a' => {
390 if self.ends("al") {
391 self.update_offset("al");
392 byte_was_matched = true;
393 }
394 }
395 b'c' => {
396 if self.ends("ance") {
397 self.update_offset("ance");
398 byte_was_matched = true;
399 } else if self.ends("ence") {
400 self.update_offset("ence");
401 byte_was_matched = true;
402 }
403 }
404 b'e' => {
405 if self.ends("er") {
406 self.update_offset("er");
407 byte_was_matched = true;
408 }
409 }
410 b'i' => {
411 if self.ends("ic") {
412 self.update_offset("ic");
413 byte_was_matched = true;
414 }
415 }
416 b'l' => {
417 if self.ends("able") {
418 self.update_offset("able");
419 byte_was_matched = true;
420 } else if self.ends("ible") {
421 self.update_offset("ible");
422 byte_was_matched = true;
423 }
424 }
425 b'n' => {
426 if self.ends("ant") {
427 self.update_offset("ant");
428 byte_was_matched = true;
429 } else if self.ends("ement") {
430 self.update_offset("ement");
431 byte_was_matched = true;
432 } else if self.ends("ment") {
433 self.update_offset("ment");
434 byte_was_matched = true;
435 } else if self.ends("ent") {
436 self.update_offset("ent");
437 byte_was_matched = true;
438 }
439 }
440 b'o' => {
441 if self.ends("ion") {
442 self.update_offset("ion");
443 if self.offset > 0 && (self.bytes[self.offset - 1] == b's' || self.bytes[self.offset - 1] == b't') {
444 byte_was_matched = true;
445 }
446 } else if self.ends("ou") {
447 self.update_offset("ou");
448 byte_was_matched = true;
449 }
450 }
452 b's' => {
453 if self.ends("ism") {
454 self.update_offset("ism");
455 byte_was_matched = true;
456 }
457 }
458 b't' => {
459 if self.ends("ate") {
460 self.update_offset("ate");
461 byte_was_matched = true;
462 } else if self.ends("iti") {
463 self.update_offset("iti");
464 byte_was_matched = true;
465 }
466 }
467 b'u' => {
468 if self.ends("ous") {
469 self.update_offset("ous");
470 byte_was_matched = true;
471 }
472 }
473 b'v' => {
474 if self.ends("ive") {
475 self.update_offset("ive");
476 byte_was_matched = true;
477 }
478 }
479 b'z' => {
480 if self.ends("ize") {
481 self.update_offset("ize");
482 byte_was_matched = true;
483 }
484 }
485 _ => return,
486 }
487 if byte_was_matched && self.measure() > 1 {
488 self.bytes_length = self.offset
489 }
490 }
491
492 fn step5(&mut self) {
495 self.offset = self.bytes_length;
496 if self.bytes[self.bytes_length - 1] == b'e' {
497 let a = self.measure();
498 if a > 1 || a == 1 && !self.cvc(self.bytes_length - 2) {
499 self.bytes_length -= 1
500 }
501 }
502 if self.bytes[self.bytes_length - 1] == b'l' && self.double_consonant(self.bytes_length - 1) && self.measure() > 1 {
503 self.bytes_length -= 1;
504 }
505 }
506
507 #[inline]
508 fn get(&self) -> String {
509 unsafe { str::from_utf8_unchecked(&self.bytes[..self.bytes_length]).to_owned() }
510 }
511}
512
513pub fn get(word: &str) -> Result<String, RnltkError> {
537 if word.len() > 2 {
538 let mut mw = Stemmer::new(word)?;
539 mw.step1ab();
540 mw.step1c();
541 mw.step2();
542 mw.step3();
543 mw.step4();
544 mw.step5();
545 Ok(mw.get())
546 } else {
547 Ok(word.to_owned())
548 }
549}
550
#[cfg(test)]
mod test_stem {
    use super::get;
    use std::ops::Deref;

    // Parallel fixture files: one word per line in, one stem per line out.
    pub static INPUT: &str = include_str!("../test_data/voc.txt");
    pub static RESULT: &str = include_str!("../test_data/output.txt");

    /// Stems each word in `tests` and compares it against the parallel
    /// entry of `results`, with trailing whitespace trimmed on both sides.
    fn test_loop<I0: Iterator<Item = T>, I1: Iterator<Item = T>, T: Deref<Target = str>>(
        tests: I0,
        results: I1,
    ) {
        for (word, expected) in tests.zip(results) {
            let word = word.trim_end();
            let expected = expected.trim_end();
            let stemmed = get(word);

            assert!(stemmed.is_ok(), "[FAILED] Expected stem for '{}'", word);
            assert_eq!(stemmed.unwrap().trim_end(), expected);
        }
    }

    #[test]
    fn lexicon() {
        test_loop(INPUT.split('\n'), RESULT.split('\n'));
    }
}