// charabia/tokenizer.rs
1use std::borrow::Cow;
2
3use aho_corasick::{AhoCorasick, MatchKind};
4use fst::Set;
5
6use crate::detection::Language;
7use crate::normalizer::{NormalizedTokenIter, NormalizerOption};
8use crate::segmenter::{Segment, SegmentedStrIter, SegmentedTokenIter, SegmenterOption};
9use crate::separators::DEFAULT_SEPARATORS;
10use crate::Token;
11
/// Iterator over tuples of [`&str`] (part of the original text) and [`Token`].
///
/// Yielded by [`Tokenize::reconstruct`] and [`Tokenizer::reconstruct`]; each item
/// pairs a token with the exact slice of the original text it was produced from.
pub struct ReconstructedTokenIter<'o, 'aho, 'lang, 'tb> {
    // Underlying iterator producing normalized tokens over `original`.
    token_iter: NormalizedTokenIter<'o, 'aho, 'lang, 'tb>,
    // The untouched input text; sliced using each token's byte range.
    original: &'o str,
}
17
18impl<'o> Iterator for ReconstructedTokenIter<'o, '_, '_, '_> {
19 type Item = (&'o str, Token<'o>);
20
21 fn next(&mut self) -> Option<Self::Item> {
22 self.token_iter
23 .next()
24 .map(|token| (&self.original[token.byte_start..token.byte_end], token))
25 }
26}
27
/// Trait defining methods to tokenize a text.
///
/// Implemented for `&str`, using the default normalizer options.
pub trait Tokenize<'o> {
    /// Creates an Iterator over [`Token`]s.
    ///
    /// The provided text is segmented creating tokens,
    /// then tokens are normalized and classified.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::{Token, TokenKind, Tokenize, SeparatorKind};
    ///
    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
    ///
    /// let mut tokens = orig.tokenize();
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, "the");
    /// assert_eq!(kind, TokenKind::Word);
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, " ");
    /// assert_eq!(kind, TokenKind::Separator(SeparatorKind::Soft));
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, "quick");
    /// assert_eq!(kind, TokenKind::Word);
    /// ```
    fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_>;

    /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::{Token, TokenKind, Tokenize, SeparatorKind};
    ///
    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
    ///
    /// let mut pairs = orig.reconstruct();
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, "The");
    /// assert_eq!(lemma, "the");
    /// assert_eq!(kind, TokenKind::Word);
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, " ");
    /// assert_eq!(lemma, " ");
    /// assert_eq!(kind, TokenKind::Separator(SeparatorKind::Soft));
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, "quick");
    /// assert_eq!(lemma, "quick");
    /// assert_eq!(kind, TokenKind::Word);
    /// ```
    fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_>;
}
86
87impl Tokenize<'_> for &str {
88 fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_> {
89 self.segment().normalize(&crate::normalizer::DEFAULT_NORMALIZER_OPTION)
90 }
91
92 fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_> {
93 ReconstructedTokenIter { original: self, token_iter: self.tokenize() }
94 }
95}
96
/// Structure used to tokenize a text with custom configurations.
///
/// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
#[derive(Debug)]
pub struct Tokenizer<'tb> {
    // Options driving segmentation (optional Aho-Corasick automaton, language allow-list).
    // `Cow` lets the tokenizer either borrow the options from a builder or own them.
    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
    // Options driving normalization and classification (stop words, separators, char_map, ...).
    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
}
105
106impl Tokenizer<'_> {
107 /// Creates an Iterator over [`Token`]s.
108 ///
109 /// The provided text is segmented creating tokens,
110 /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
111 pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't, 't, 't> {
112 original
113 .segment_with_option(
114 self.segmenter_option.aho.as_ref(),
115 self.segmenter_option.allow_list,
116 )
117 .normalize(&self.normalizer_option)
118 }
119
120 /// Creates an Iterator over [`Token`]s.
121 ///
122 /// The provided text is segmented creating tokens,
123 /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
124 ///
125 /// # Arguments
126 ///
127 /// * `allow_list` - a slice of [`Language`] to allow during autodetection.
128 pub fn tokenize_with_allow_list<'t, 'o, 'lang>(
129 &'t self,
130 original: &'o str,
131 allow_list: Option<&'lang [Language]>,
132 ) -> NormalizedTokenIter<'o, 't, 'lang, 't> {
133 original
134 .segment_with_option(self.segmenter_option.aho.as_ref(), allow_list)
135 .normalize(&self.normalizer_option)
136 }
137
138 /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
139 pub fn reconstruct<'t, 'o>(
140 &'t self,
141 original: &'o str,
142 ) -> ReconstructedTokenIter<'o, 't, 't, 't> {
143 ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
144 }
145
146 /// Segments the provided text creating an Iterator over [`Token`].
147 pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't, 't> {
148 original.segment_with_option(
149 self.segmenter_option.aho.as_ref(),
150 self.segmenter_option.allow_list,
151 )
152 }
153
154 /// Segments the provided text creating an Iterator over `&str`.
155 pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't, 't> {
156 original.segment_str_with_option(
157 self.segmenter_option.aho.as_ref(),
158 self.segmenter_option.allow_list,
159 )
160 }
161}
162
/// Structure to build a tokenizer with custom settings.
///
/// To use default settings, use directly the `Tokenize` implementation on &str.
///
/// # Example
///
/// ```
/// use fst::Set;
///
/// use charabia::TokenizerBuilder;
///
/// // text to tokenize.
/// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
///
/// // create the builder.
/// let mut builder = TokenizerBuilder::new();
///
/// // create a set of stop words.
/// let stop_words: Set<Vec<u8>> = Set::from_iter(["the"].iter()).unwrap();
///
/// // configure stop words.
/// builder.stop_words(&stop_words);
///
/// // build the tokenizer passing the text to tokenize.
/// let tokenizer = builder.build();
/// ```
///
pub struct TokenizerBuilder<'tb, A> {
    // Stop-word set kept as given by the caller; mirrored into `normalizer_option`.
    stop_words: Option<&'tb Set<A>>,
    // Custom words segmented before any language-based segmentation.
    words_dict: Option<&'tb [&'tb str]>,
    // Normalizer/classifier settings assembled by the setter methods.
    normalizer_option: NormalizerOption<'tb>,
    // Segmenter settings (Aho-Corasick automaton, allow-list) assembled at build time.
    segmenter_option: SegmenterOption<'tb>,
}
196
197impl<'tb, A> TokenizerBuilder<'tb, A> {
198 /// Create a `TokenizerBuilder` with default settings,
199 ///
200 /// if you don't plan to set stop_words, prefer use [`TokenizerBuilder::default`]
201 pub fn new() -> TokenizerBuilder<'tb, A> {
202 Self {
203 normalizer_option: crate::normalizer::DEFAULT_NORMALIZER_OPTION,
204 segmenter_option: SegmenterOption::default(),
205 stop_words: None,
206 words_dict: None,
207 }
208 }
209}
210
211impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
212 /// Configure the words that will be classified as `TokenKind::StopWord`.
213 ///
214 /// # Arguments
215 ///
216 /// * `stop_words` - a `Set` of the words to classify as stop words.
217 pub fn stop_words(&mut self, stop_words: &'tb Set<A>) -> &mut Self {
218 self.stop_words = Some(stop_words);
219 self.normalizer_option.classifier.stop_words = self.stop_words.map(|sw| {
220 let sw = sw.as_fst().as_bytes();
221 Set::new(sw).unwrap()
222 });
223 self
224 }
225
226 /// Configure the words that will be used to separate words and classified as `TokenKind::Separator`.
227 ///
228 /// # Arguments
229 ///
230 /// * `separators` - a slice of str to classify as separator.
231 ///
232 /// # Example
233 ///
234 /// ```
235 /// use charabia::TokenizerBuilder;
236 ///
237 /// // create the builder.
238 /// let mut builder = TokenizerBuilder::default();
239 ///
240 /// // create a custom list of separators.
241 /// let separators = [" ", ", ", ". ", "?", "!"];
242 ///
243 /// // configurate separators.
244 /// builder.separators(&separators);
245 ///
246 /// // build the tokenizer passing the text to tokenize.
247 /// let tokenizer = builder.build();
248 ///
249 /// // text to tokenize.
250 /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
251 ///
252 /// let output: Vec<_> = tokenizer.segment_str(orig).collect();
253 /// assert_eq!(
254 /// &output,
255 /// &["The", " ", "quick", " ", "(\"brown\")", " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it's", " ", "29.3°F", "!"]
256 /// );
257 /// ```
258 ///
259 pub fn separators(&mut self, separators: &'tb [&'tb str]) -> &mut Self {
260 self.normalizer_option.classifier.separators = Some(separators);
261 self
262 }
263
264 /// Configure the words that will be segmented before any other segmentation.
265 ///
266 /// This words dictionary is used to override the segmentation over these words,
267 /// the tokenizer will find all the occurences of these words before any Language based segmentation.
268 /// If some of the words are in the stop_words' list or in the separators' list,
269 /// then they will be categorized as `TokenKind::StopWord` or as `TokenKind::Separator` aswell.
270 ///
271 /// # Arguments
272 ///
273 /// * `words` - a slice of str.
274 ///
275 /// # Example
276 ///
277 /// ```
278 /// use charabia::TokenizerBuilder;
279 ///
280 /// // create the builder.
281 /// let mut builder = TokenizerBuilder::default();
282 ///
283 /// // create a custom list of words.
284 /// let words = ["J. R. R.", "Dr.", "J. K."];
285 ///
286 /// // configurate words.
287 /// builder.words_dict(&words);
288 ///
289 /// // build the tokenizer passing the text to tokenize.
290 /// let tokenizer = builder.build();
291 ///
292 /// // text to tokenize.
293 /// let orig = "J. R. R. Tolkien. J. K. Rowling. Dr. Seuss";
294 ///
295 /// let output: Vec<_> = tokenizer.segment_str(orig).collect();
296 /// assert_eq!(
297 /// &output,
298 /// &["J. R. R.", " ", "Tolkien", ". ", "J. K.", " ", "Rowling", ". ", "Dr.", " ", "Seuss"]
299 /// );
300 /// ```
301 ///
302 pub fn words_dict(&mut self, words: &'tb [&'tb str]) -> &mut Self {
303 self.words_dict = Some(words);
304 self
305 }
306
307 /// Enable or disable the creation of `char_map`.
308 ///
309 /// # Arguments
310 ///
311 /// * `create_char_map` - a `bool` that indicates whether a `char_map` should be created.
312 pub fn create_char_map(&mut self, create_char_map: bool) -> &mut Self {
313 self.normalizer_option.create_char_map = create_char_map;
314 self
315 }
316
317 /// Enable or disable the lossy normalization.
318 ///
319 /// A lossy normalization is a kind of normalization that could change the meaning in some way.
320 /// Removing diacritics is considered lossy; for instance, in French the word `maïs` (`corn`) will be normalized as `mais` (`but`) which changes the meaning.
321 ///
322 /// # Arguments
323 ///
324 /// * `lossy` - a `bool` that enable or disable the lossy normalization.
325 pub fn lossy_normalization(&mut self, lossy: bool) -> &mut Self {
326 self.normalizer_option.lossy = lossy;
327 self
328 }
329
330 /// Configure which languages can be used for which script
331 ///
332 /// # Arguments
333 ///
334 /// * `allow_list` - a `HashMap` of the selection of languages associated with a script to limit during autodetection.
335 pub fn allow_list(&mut self, allow_list: &'tb [Language]) -> &mut Self {
336 self.segmenter_option.allow_list = Some(allow_list);
337 self
338 }
339
340 /// Build the configurated `Tokenizer`.
341 pub fn build(&mut self) -> Tokenizer<'_> {
342 // If a custom list of separators or/and a custom list of words have been given,
343 // then an Aho-Corasick automaton is created to pre-segment the text during the tokenization process
344 // TODO: avoid recreating the automaton if nothing changed
345 match (self.normalizer_option.classifier.separators, self.words_dict) {
346 (Some(separators), None) => {
347 let pattern = separators.iter().filter(|s| !s.is_empty());
348 let aho = AhoCorasick::builder()
349 .match_kind(MatchKind::LeftmostLongest)
350 .build(pattern)
351 .unwrap();
352
353 self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
354 }
355 (separators, Some(words)) => {
356 // use the default separators' list if a custom words' list is given but no custom separators' list.
357 let separators = separators.unwrap_or(DEFAULT_SEPARATORS);
358 // merge both lists together and create the Aho-Corasick automaton.
359 let pattern = words.iter().chain(separators).filter(|s| !s.is_empty());
360 let aho = AhoCorasick::builder()
361 .match_kind(MatchKind::LeftmostLongest)
362 .build(pattern)
363 .unwrap();
364
365 self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
366 }
367 // reset the state in case the builder is reused.
368 (None, None) => self.segmenter_option.aho = None,
369 }
370
371 Tokenizer {
372 normalizer_option: Cow::Borrowed(&self.normalizer_option),
373 segmenter_option: Cow::Borrowed(&self.segmenter_option),
374 }
375 }
376
377 /// Build the configurated `Tokenizer` consumming self.
378 ///
379 /// This method allows to drop the tokenizer builder without having to drop the Tokenizer itself.
380 pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
381 drop(self.build());
382
383 Tokenizer {
384 normalizer_option: Cow::Owned(self.normalizer_option),
385 segmenter_option: Cow::Owned(self.segmenter_option),
386 }
387 }
388}
389
390impl Default for TokenizerBuilder<'_, Vec<u8>> {
391 fn default() -> Self {
392 Self::new()
393 }
394}
395
#[cfg(test)]
mod test {
    use fst::Set;
    use quickcheck::quickcheck;

    use crate::{Tokenize, TokenizerBuilder};

    // Ensures the tokens outlive the builder/tokenizer that produced them:
    // the collected `Vec<Token>` must stay valid after the tokenizer is dropped.
    #[test]
    fn check_lifetimes() {
        let text = "Hello world! Pleased to see you.";

        // default tokenization via the `Tokenize` impl on &str.
        let tokens: Vec<_> = { text.tokenize().collect() };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // tokenizer built from a default builder; tokens collected inside an
        // inner scope so the tokenizer is dropped before the assertion.
        let tokens: Vec<_> = {
            let mut builder = TokenizerBuilder::default();
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // same, but with a stop-words set borrowed by the builder.
        let tokens: Vec<_> = {
            let stop_words: Set<Vec<u8>> = Set::from_iter(["to"].iter()).unwrap();
            let mut builder = TokenizerBuilder::new();
            let builder = builder.stop_words(&stop_words);
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));
    }

    // Property: tokenization never yields more tokens than input bytes.
    #[quickcheck]
    fn shorten_after_tokenized(text: String) -> bool {
        let text = text.as_str();
        let tokens: Vec<_> = text.tokenize().collect();
        tokens.len() <= text.len()
    }
}
439}