tantivy_stemmers/
lib.rs

1use std::borrow::Cow;
2use std::mem;
3use tantivy_tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
4
5mod snowball;
6
7pub mod algorithms;
8
9/// Stemmer tokenizer. Several algorithms are supported, see [`algorithms`] or
10/// https://crates.io/crates/tantivy-stemmers for a list of all available algorithms.
11///
12/// ❗️❗️ Tokens are expected to be lowercased beforehand.
13#[derive(Clone)]
14pub struct StemmerTokenizer {
15    algorithm: algorithms::Algorithm,
16}
17
18impl StemmerTokenizer {
19    /// Creates a new `StemmerTokenizer` [`StemmerTokenizer`] for a given language or variant algorithm.
20    pub fn new(algorithm: algorithms::Algorithm) -> StemmerTokenizer {
21        StemmerTokenizer { algorithm }
22    }
23}
24
// Only available when the `english_porter_2` feature (and thus the algorithm) is compiled in.
#[cfg(feature = "english_porter_2")]
impl Default for StemmerTokenizer {
    /// Creates a new [`StemmerTokenizer`] with the default algorithm,
    /// [`algorithms::english_porter_2`].
    fn default() -> Self {
        Self::new(algorithms::english_porter_2)
    }
}
32
33impl TokenFilter for StemmerTokenizer {
34    type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
35
36    fn transform<T: Tokenizer>(self, tokenizer: T) -> StemmerFilter<T> {
37        StemmerFilter {
38            algorithm: self.algorithm,
39            inner: tokenizer,
40        }
41    }
42}
43
44#[derive(Clone)]
45pub struct StemmerFilter<T> {
46    algorithm: algorithms::Algorithm,
47    inner: T,
48}
49
50impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
51    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
52
53    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
54        StemmerTokenStream {
55            tail: self.inner.token_stream(text),
56            buffer: String::new(),
57            algorithm: self.algorithm,
58        }
59    }
60}
61
62pub struct StemmerTokenStream<T> {
63    tail: T,
64    buffer: String,
65    algorithm: algorithms::Algorithm,
66}
67
68impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
69    fn advance(&mut self) -> bool {
70        if !self.tail.advance() {
71            return false;
72        }
73
74        let token = self.tail.token_mut();
75
76        match (self.algorithm)(&token.text) {
77            Cow::Owned(stemmed_str) => token.text = stemmed_str,
78            Cow::Borrowed(stemmed_str) => {
79                self.buffer.clear();
80                self.buffer.push_str(stemmed_str);
81                mem::swap(&mut token.text, &mut self.buffer);
82            }
83        }
84
85        true
86    }
87
88    fn token(&self) -> &Token {
89        self.tail.token()
90    }
91
92    fn token_mut(&mut self) -> &mut Token {
93        self.tail.token_mut()
94    }
95}