1use std::collections::HashSet;
2
3use fst::Set;
4use once_cell::sync::Lazy;
5
6use super::{Normalizer, NormalizerOption};
7use crate::{SeparatorKind, Token, TokenKind};
8
/// Normalizer that classifies tokens as words, stop words, or separators.
pub struct Classifier;
19
20impl Normalizer for Classifier {
21 fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
22 token.kind = TokenKind::Word;
23 let lemma = token.lemma();
24
25 if let Some(stop_words) = &options.classifier.stop_words {
26 if stop_words.contains(lemma) {
27 token.kind = TokenKind::StopWord;
28 return token;
29 }
30 }
31
32 match options.classifier.separators {
33 Some(separators) if separators.contains(&lemma) => {
34 token.kind = TokenKind::Separator(separator_kind(lemma));
35 }
36 None if DEFAULT_SEPARATOR_SET.contains(lemma) => {
37 token.kind = TokenKind::Separator(separator_kind(lemma));
38 }
39 _otherwise => (),
40 }
41
42 token
43 }
44
45 fn should_normalize(&self, token: &Token) -> bool {
46 token.kind == TokenKind::Unknown
47 }
48}
49
/// Configuration for the [`Classifier`] normalizer.
#[derive(Debug, Clone, Default)]
pub struct ClassifierOption<'no> {
    /// Optional FST set of stop words; a token whose lemma is in this set
    /// is classified as `TokenKind::StopWord`.
    pub stop_words: Option<Set<&'no [u8]>>,
    /// Optional custom separator list; when `None`, the crate default
    /// separator set (`DEFAULT_SEPARATOR_SET`) is used instead.
    pub separators: Option<&'no [&'no str]>,
}
56
57fn separator_kind(lemma: &str) -> SeparatorKind {
58 if CONTEXT_SEPARATOR_SET.contains(lemma) {
59 SeparatorKind::Hard
60 } else {
61 SeparatorKind::Soft
62 }
63}
64
/// Default set of separator lemmas, used when no custom separator list is
/// configured in [`ClassifierOption`].
pub static DEFAULT_SEPARATOR_SET: Lazy<HashSet<&str>> =
    Lazy::new(|| crate::separators::DEFAULT_SEPARATORS.iter().copied().collect());

/// Separator lemmas classified as hard separators by `separator_kind`
/// (per the tests below, e.g. ". " and "。"); all others are soft.
pub static CONTEXT_SEPARATOR_SET: Lazy<HashSet<&str>> =
    Lazy::new(|| crate::separators::CONTEXT_SEPARATORS.iter().copied().collect());
70
#[cfg(test)]
mod test {
    use std::borrow::Cow;

    use crate::normalizer::test::test_normalizer;

    // Input tokens: soft separators, hard separators, and plain words,
    // all starting with the default (Unknown) kind.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            Token { lemma: Cow::Borrowed(" "), ..Default::default() },
            Token { lemma: Cow::Borrowed("\""), ..Default::default() },
            Token { lemma: Cow::Borrowed("@"), ..Default::default() },
            Token { lemma: Cow::Borrowed("."), ..Default::default() },
            Token { lemma: Cow::Borrowed(". "), ..Default::default() },
            Token { lemma: Cow::Borrowed("。"), ..Default::default() },
            Token { lemma: Cow::Borrowed("S.O.S"), ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), ..Default::default() },
        ]
    }

    // Expected output of running the Classifier on tokens(): separators are
    // recognized from the default set (". " and "。" being hard),
    // everything else becomes a Word.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Cow::Borrowed(" "),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("\""),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("@"),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("."),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed(". "),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("。"),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token { lemma: Cow::Borrowed("S.O.S"), kind: TokenKind::Word, ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), kind: TokenKind::Word, ..Default::default() },
        ]
    }

    // Expected output of the full normalizer pipeline — identical to
    // normalizer_result(), presumably because these lemmas need no further
    // normalization (NOTE(review): confirm against the test_normalizer! macro).
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Cow::Borrowed(" "),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("\""),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("@"),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("."),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed(". "),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("。"),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token { lemma: Cow::Borrowed("S.O.S"), kind: TokenKind::Word, ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), kind: TokenKind::Word, ..Default::default() },
        ]
    }

    // Generates the shared normalizer test-suite for Classifier.
    test_normalizer!(Classifier, tokens(), normalizer_result(), normalized_tokens());

    #[test]
    fn stop_words() {
        // Build an FST set and re-open it over its own raw bytes so the
        // resulting Set borrows a &[u8], matching ClassifierOption's type.
        let stop_words = Set::from_iter(["the"].iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let options = NormalizerOption {
            create_char_map: true,
            classifier: ClassifierOption { stop_words: Some(stop_words), separators: None },
            lossy: false,
        };

        // Exact match is a stop word.
        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("the"), ..Default::default() }, &options);
        assert!(token.is_stopword());

        // Matching is case-sensitive at this stage: "The" is not "the".
        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("The"), ..Default::default() }, &options);
        assert!(token.is_word());

        // A lemma outside the set stays a word.
        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("foobar"), ..Default::default() }, &options);
        assert!(token.is_word());
    }

    // Property: a token is classified as a stop word exactly when the
    // configured stop-word set contains its lemma.
    #[quickcheck]
    fn is_stop_word_iff_stop_words_contain_lemma(
        mut stop_words: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
        containing: bool,
    ) {
        // Force membership (or non-membership) of `lemma` in the set.
        if containing {
            stop_words.push(lemma.clone());
        } else {
            stop_words.retain(|w| w != &lemma);
        }

        // fst::Set::from_iter requires lexicographically sorted input.
        stop_words.sort();
        let stop_words = Set::from_iter(stop_words.iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption { stop_words: Some(stop_words), separators: None },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        assert_eq!(token.is_stopword(), containing);
    }

    // Property: with a custom separator list, a token is a separator exactly
    // when that list contains its lemma (the default set is not consulted).
    #[quickcheck]
    fn is_separator_if_separators_contain_lemma(
        mut separators: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
        containing: bool,
    ) {
        if containing {
            separators.push(lemma.clone());
        } else {
            separators.retain(|w| w != &lemma);
        }
        let separators: Vec<&str> = separators.iter().map(|s| s.as_str()).collect();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption { stop_words: None, separators: Some(&separators) },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        assert_eq!(token.is_separator(), containing);
        if containing {
            assert!(token.is_separator());
        }
    }

    // Property: when a lemma appears in both the stop-word set and the
    // separator list, stop-word classification wins.
    #[quickcheck]
    fn is_stop_word_if_both_stop_works_and_separators_contain_lemma(
        mut stop_words_and_separators: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
    ) {
        stop_words_and_separators.push(lemma.clone());
        // Sorted input is required by fst::Set::from_iter.
        stop_words_and_separators.sort();
        let stop_words = Set::from_iter(stop_words_and_separators.iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let separators: Vec<&str> = stop_words_and_separators.iter().map(|s| s.as_str()).collect();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption {
                stop_words: Some(stop_words),
                separators: Some(&separators),
            },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        assert!(token.is_stopword());
        assert!(!token.is_separator());
    }
}
277}