lance_tokenizer/
stemmer.rs1use std::borrow::Cow;
8use std::mem;
9
10use rust_stemmers::Algorithm;
11use serde::{Deserialize, Serialize};
12
13use crate::{Token, TokenFilter, TokenStream, Tokenizer};
14
15#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
16pub enum Language {
17 Arabic,
18 Danish,
19 Dutch,
20 English,
21 Finnish,
22 French,
23 German,
24 Greek,
25 Hungarian,
26 Italian,
27 Norwegian,
28 Portuguese,
29 Romanian,
30 Russian,
31 Spanish,
32 Swedish,
33 Tamil,
34 Turkish,
35}
36
37impl Language {
38 fn algorithm(self) -> Algorithm {
39 match self {
40 Self::Arabic => Algorithm::Arabic,
41 Self::Danish => Algorithm::Danish,
42 Self::Dutch => Algorithm::Dutch,
43 Self::English => Algorithm::English,
44 Self::Finnish => Algorithm::Finnish,
45 Self::French => Algorithm::French,
46 Self::German => Algorithm::German,
47 Self::Greek => Algorithm::Greek,
48 Self::Hungarian => Algorithm::Hungarian,
49 Self::Italian => Algorithm::Italian,
50 Self::Norwegian => Algorithm::Norwegian,
51 Self::Portuguese => Algorithm::Portuguese,
52 Self::Romanian => Algorithm::Romanian,
53 Self::Russian => Algorithm::Russian,
54 Self::Spanish => Algorithm::Spanish,
55 Self::Swedish => Algorithm::Swedish,
56 Self::Tamil => Algorithm::Tamil,
57 Self::Turkish => Algorithm::Turkish,
58 }
59 }
60}
61
62#[derive(Clone)]
63pub struct Stemmer {
64 stemmer_algorithm: Algorithm,
65}
66
67impl Stemmer {
68 pub fn new(language: Language) -> Self {
69 Self {
70 stemmer_algorithm: language.algorithm(),
71 }
72 }
73}
74
75impl Default for Stemmer {
76 fn default() -> Self {
77 Self::new(Language::English)
78 }
79}
80
81impl TokenFilter for Stemmer {
82 type Tokenizer<T: Tokenizer> = StemmerFilter<T>;
83
84 fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
85 StemmerFilter {
86 stemmer_algorithm: self.stemmer_algorithm,
87 inner: tokenizer,
88 }
89 }
90}
91
92#[derive(Clone)]
93pub struct StemmerFilter<T> {
94 stemmer_algorithm: Algorithm,
95 inner: T,
96}
97
98impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
99 type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
100
101 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
102 StemmerTokenStream {
103 tail: self.inner.token_stream(text),
104 stemmer: rust_stemmers::Stemmer::create(self.stemmer_algorithm),
105 buffer: String::new(),
106 }
107 }
108}
109
110pub struct StemmerTokenStream<T> {
111 tail: T,
112 stemmer: rust_stemmers::Stemmer,
113 buffer: String,
114}
115
116impl<T: TokenStream> TokenStream for StemmerTokenStream<T> {
117 fn advance(&mut self) -> bool {
118 if !self.tail.advance() {
119 return false;
120 }
121 let token = self.tail.token_mut();
122 let stemmed = self.stemmer.stem(&token.text);
123 match stemmed {
124 Cow::Owned(stemmed) => token.text = stemmed,
125 Cow::Borrowed(stemmed) => {
126 self.buffer.clear();
127 self.buffer.push_str(stemmed);
128 mem::swap(&mut token.text, &mut self.buffer);
129 }
130 }
131 true
132 }
133
134 fn token(&self) -> &Token {
135 self.tail.token()
136 }
137
138 fn token_mut(&mut self) -> &mut Token {
139 self.tail.token_mut()
140 }
141}