// charabia/normalizer/classify.rs

use std::collections::HashSet;

use fst::Set;
use once_cell::sync::Lazy;

use super::{Normalizer, NormalizerOption};
use crate::{SeparatorKind, Token, TokenKind};

/// Classifies a `Token` as a word, a stop word, or a separator.
///
/// Assigns a [`TokenKind`] to each [`Token`] using the provided stop words.
///
/// [`TokenKind`]: crate::TokenKind
///
/// Any `Token` whose lemma is in the stop words [`Set`] is assigned [`TokenKind::StopWord`];
/// any lemma found in the provided separator list (or in the default separator set when
/// no list is provided) is assigned a `TokenKind::Separator`.
///
/// [`TokenKind::StopWord`]: crate::TokenKind#StopWord
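///
/// # Example
///
/// A minimal sketch of the intended behavior, assuming default options (no custom
/// stop words or separators, relying on the derived `Default` of `ClassifierOption`);
/// not compiled as a doctest:
///
/// ```ignore
/// use std::borrow::Cow;
///
/// let options = NormalizerOption {
///     create_char_map: false,
///     classifier: ClassifierOption::default(),
///     lossy: false,
/// };
/// // "。" is in the default separator set and marks a context switch.
/// let token = Token { lemma: Cow::Borrowed("。"), ..Default::default() };
/// let token = Classifier.normalize(token, &options);
/// assert_eq!(token.kind, TokenKind::Separator(SeparatorKind::Hard));
/// ```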
pub struct Classifier;

impl Normalizer for Classifier {
    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
        token.kind = TokenKind::Word;
        let lemma = token.lemma();

        if let Some(stop_words) = &options.classifier.stop_words {
            if stop_words.contains(lemma) {
                token.kind = TokenKind::StopWord;
                return token;
            }
        }

        match options.classifier.separators {
            Some(separators) if separators.contains(&lemma) => {
                token.kind = TokenKind::Separator(separator_kind(lemma));
            }
            None if DEFAULT_SEPARATOR_SET.contains(lemma) => {
                token.kind = TokenKind::Separator(separator_kind(lemma));
            }
            _otherwise => (),
        }

        token
    }

    /// Classify only tokens that have not been assigned a kind yet.
    fn should_normalize(&self, token: &Token) -> bool {
        token.kind == TokenKind::Unknown
    }
}

/// Structure for providing options to the classifier.
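///
/// # Example
///
/// A sketch of building a borrowed stop words [`Set`], mirroring the construction
/// used in the tests below; not compiled as a doctest:
///
/// ```ignore
/// // `ClassifierOption` expects a `Set<&[u8]>`, so build an owned set first
/// // and re-wrap its backing bytes.
/// let owned = Set::from_iter(["the"].iter()).unwrap();
/// let bytes = owned.as_fst().as_bytes();
/// let stop_words = Set::new(bytes).unwrap();
/// let options = ClassifierOption { stop_words: Some(stop_words), separators: None };
/// ```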
#[derive(Debug, Clone, Default)]
pub struct ClassifierOption<'no> {
    pub stop_words: Option<Set<&'no [u8]>>,
    pub separators: Option<&'no [&'no str]>,
}

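/// Returns [`SeparatorKind::Hard`] if the lemma is a context separator, one that
/// marks a context switch like `"。"` or `". "`, otherwise [`SeparatorKind::Soft`].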
fn separator_kind(lemma: &str) -> SeparatorKind {
    if CONTEXT_SEPARATOR_SET.contains(lemma) {
        SeparatorKind::Hard
    } else {
        SeparatorKind::Soft
    }
}

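/// Separators classified by default when `ClassifierOption::separators` is `None`.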
pub static DEFAULT_SEPARATOR_SET: Lazy<HashSet<&str>> =
    Lazy::new(|| crate::separators::DEFAULT_SEPARATORS.iter().copied().collect());

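/// Separators marking a context switch; these are classified as
/// [`SeparatorKind::Hard`].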
pub static CONTEXT_SEPARATOR_SET: Lazy<HashSet<&str>> =
    Lazy::new(|| crate::separators::CONTEXT_SEPARATORS.iter().copied().collect());

#[cfg(test)]
mod test {
    use std::borrow::Cow;

    use crate::normalizer::test::test_normalizer;

    // base tokens to normalize.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            Token { lemma: Cow::Borrowed(" "), ..Default::default() },
            Token { lemma: Cow::Borrowed("\""), ..Default::default() },
            Token { lemma: Cow::Borrowed("@"), ..Default::default() },
            Token { lemma: Cow::Borrowed("."), ..Default::default() },
            Token { lemma: Cow::Borrowed(". "), ..Default::default() },
            Token { lemma: Cow::Borrowed("。"), ..Default::default() },
            Token { lemma: Cow::Borrowed("S.O.S"), ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), ..Default::default() },
        ]
    }

    // expected result of the current Normalizer.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Cow::Borrowed(" "),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("\""),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("@"),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("."),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed(". "),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("。"),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token { lemma: Cow::Borrowed("S.O.S"), kind: TokenKind::Word, ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), kind: TokenKind::Word, ..Default::default() },
        ]
    }

    // expected result of the complete Normalizer pipeline.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Cow::Borrowed(" "),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("\""),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("@"),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("."),
                kind: TokenKind::Separator(SeparatorKind::Soft),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed(". "),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token {
                lemma: Cow::Borrowed("。"),
                kind: TokenKind::Separator(SeparatorKind::Hard),
                ..Default::default()
            },
            Token { lemma: Cow::Borrowed("S.O.S"), kind: TokenKind::Word, ..Default::default() },
            Token { lemma: Cow::Borrowed("ь"), kind: TokenKind::Word, ..Default::default() },
        ]
    }

    test_normalizer!(Classifier, tokens(), normalizer_result(), normalized_tokens());

    #[test]
    fn stop_words() {
        // Build a borrowed fst `Set` from an owned one, since `ClassifierOption`
        // expects a `Set<&[u8]>`.
        let stop_words = Set::from_iter(["the"].iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let options = NormalizerOption {
            create_char_map: true,
            classifier: ClassifierOption { stop_words: Some(stop_words), separators: None },
            lossy: false,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("the"), ..Default::default() }, &options);
        assert!(token.is_stopword());

        // The lookup is an exact match on the lemma: "The" is not lowercased here.
        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("The"), ..Default::default() }, &options);
        assert!(token.is_word());

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed("foobar"), ..Default::default() }, &options);
        assert!(token.is_word());
    }

    #[quickcheck]
    fn is_stop_word_iff_stop_words_contain_lemma(
        mut stop_words: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
        containing: bool,
    ) {
        if containing {
            stop_words.push(lemma.clone());
        } else {
            stop_words.retain(|w| w != &lemma);
        }

        stop_words.sort();
        let stop_words = Set::from_iter(stop_words.iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption { stop_words: Some(stop_words), separators: None },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        assert_eq!(token.is_stopword(), containing);
    }

    #[quickcheck]
    fn is_separator_iff_separators_contain_lemma(
        mut separators: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
        containing: bool,
    ) {
        if containing {
            separators.push(lemma.clone());
        } else {
            separators.retain(|w| w != &lemma);
        }
        let separators: Vec<&str> = separators.iter().map(|s| s.as_str()).collect();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption { stop_words: None, separators: Some(&separators) },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        assert_eq!(token.is_separator(), containing);
    }

    #[quickcheck]
    fn is_stop_word_if_both_stop_words_and_separators_contain_lemma(
        mut stop_words_and_separators: Vec<String>,
        lemma: String,
        create_char_map: bool,
        lossy: bool,
    ) {
        stop_words_and_separators.push(lemma.clone());
        stop_words_and_separators.sort();
        let stop_words = Set::from_iter(stop_words_and_separators.iter()).unwrap();
        let stop_words = stop_words.as_fst().as_bytes();
        let stop_words = Set::new(stop_words).unwrap();
        let separators: Vec<&str> = stop_words_and_separators.iter().map(|s| s.as_str()).collect();
        let options = NormalizerOption {
            create_char_map,
            classifier: ClassifierOption {
                stop_words: Some(stop_words),
                separators: Some(&separators),
            },
            lossy,
        };

        let token = Classifier
            .normalize(Token { lemma: Cow::Borrowed(&lemma), ..Default::default() }, &options);
        // Stop words take precedence: `normalize` returns early on a stop word,
        // so the separator check never runs.
        assert!(token.is_stopword());
        assert!(!token.is_separator());
    }
}
277}