charabia/
tokenizer.rs

1use std::borrow::Cow;
2
3use aho_corasick::{AhoCorasick, MatchKind};
4use fst::Set;
5
6use crate::detection::Language;
7use crate::normalizer::{NormalizedTokenIter, NormalizerOption};
8use crate::segmenter::{Segment, SegmentedStrIter, SegmentedTokenIter, SegmenterOption};
9use crate::separators::DEFAULT_SEPARATORS;
10use crate::Token;
11
/// Iterator over tuples of [`&str`] (part of the original text) and [`Token`].
pub struct ReconstructedTokenIter<'o, 'aho, 'lang, 'tb> {
    // Underlying iterator yielding normalized tokens carrying byte offsets into `original`.
    token_iter: NormalizedTokenIter<'o, 'aho, 'lang, 'tb>,
    // The original text; each token's `byte_start..byte_end` range slices into it.
    original: &'o str,
}
17
18impl<'o> Iterator for ReconstructedTokenIter<'o, '_, '_, '_> {
19    type Item = (&'o str, Token<'o>);
20
21    fn next(&mut self) -> Option<Self::Item> {
22        self.token_iter
23            .next()
24            .map(|token| (&self.original[token.byte_start..token.byte_end], token))
25    }
26}
27
/// Trait defining methods to tokenize a text.
pub trait Tokenize<'o> {
    /// Creates an Iterator over [`Token`]s.
    ///
    /// The provided text is segmented creating tokens,
    /// then tokens are normalized and classified.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::{Token, TokenKind, Tokenize, SeparatorKind};
    ///
    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
    ///
    /// let mut tokens = orig.tokenize();
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, "the");
    /// assert_eq!(kind, TokenKind::Word);
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, " ");
    /// assert_eq!(kind, TokenKind::Separator(SeparatorKind::Soft));
    ///
    /// let Token { lemma, kind, .. } = tokens.next().unwrap();
    /// assert_eq!(lemma, "quick");
    /// assert_eq!(kind, TokenKind::Word);
    /// ```
    fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_>;

    /// Same as [`Tokenize::tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::{Token, TokenKind, Tokenize, SeparatorKind};
    ///
    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
    ///
    /// let mut pairs = orig.reconstruct();
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, "The");
    /// assert_eq!(lemma, "the");
    /// assert_eq!(kind, TokenKind::Word);
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, " ");
    /// assert_eq!(lemma, " ");
    /// assert_eq!(kind, TokenKind::Separator(SeparatorKind::Soft));
    ///
    /// let (s, Token { lemma, kind, .. }) = pairs.next().unwrap();
    /// assert_eq!(s, "quick");
    /// assert_eq!(lemma, "quick");
    /// assert_eq!(kind, TokenKind::Word);
    /// ```
    fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_>;
}
86
87impl Tokenize<'_> for &str {
88    fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_> {
89        self.segment().normalize(&crate::normalizer::DEFAULT_NORMALIZER_OPTION)
90    }
91
92    fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_> {
93        ReconstructedTokenIter { original: self, token_iter: self.tokenize() }
94    }
95}
96
/// Structure used to tokenize a text with custom configurations.
///
/// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
#[derive(Debug)]
pub struct Tokenizer<'tb> {
    // `Cow` lets the tokenizer either borrow options from a live builder
    // (`TokenizerBuilder::build`) or own them (`TokenizerBuilder::into_tokenizer`).
    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
}
105
106impl Tokenizer<'_> {
107    /// Creates an Iterator over [`Token`]s.
108    ///
109    /// The provided text is segmented creating tokens,
110    /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
111    pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't, 't, 't> {
112        original
113            .segment_with_option(
114                self.segmenter_option.aho.as_ref(),
115                self.segmenter_option.allow_list,
116            )
117            .normalize(&self.normalizer_option)
118    }
119
120    /// Creates an Iterator over [`Token`]s.
121    ///
122    /// The provided text is segmented creating tokens,
123    /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
124    ///
125    /// # Arguments
126    ///
127    /// * `allow_list` - a slice of [`Language`] to allow during autodetection.
128    pub fn tokenize_with_allow_list<'t, 'o, 'lang>(
129        &'t self,
130        original: &'o str,
131        allow_list: Option<&'lang [Language]>,
132    ) -> NormalizedTokenIter<'o, 't, 'lang, 't> {
133        original
134            .segment_with_option(self.segmenter_option.aho.as_ref(), allow_list)
135            .normalize(&self.normalizer_option)
136    }
137
138    /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
139    pub fn reconstruct<'t, 'o>(
140        &'t self,
141        original: &'o str,
142    ) -> ReconstructedTokenIter<'o, 't, 't, 't> {
143        ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
144    }
145
146    /// Segments the provided text creating an Iterator over [`Token`].
147    pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't, 't> {
148        original.segment_with_option(
149            self.segmenter_option.aho.as_ref(),
150            self.segmenter_option.allow_list,
151        )
152    }
153
154    /// Segments the provided text creating an Iterator over `&str`.
155    pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't, 't> {
156        original.segment_str_with_option(
157            self.segmenter_option.aho.as_ref(),
158            self.segmenter_option.allow_list,
159        )
160    }
161}
162
/// Structure to build a tokenizer with custom settings.
///
/// To use default settings, use directly the `Tokenize` implementation on &str.
///
/// # Example
///
/// ```
/// use fst::Set;
///
/// use charabia::TokenizerBuilder;
///
/// // text to tokenize.
/// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
///
/// // create the builder.
/// let mut builder = TokenizerBuilder::new();
///
/// // create a set of stop words.
/// let stop_words: Set<Vec<u8>> = Set::from_iter(["the"].iter()).unwrap();
///
/// // configure stop words.
/// builder.stop_words(&stop_words);
///
/// // build the tokenizer passing the text to tokenize.
/// let tokenizer = builder.build();
/// ```
///
pub struct TokenizerBuilder<'tb, A> {
    // Stop-word set kept as given by the caller; re-wrapped into the
    // normalizer options inside `stop_words`.
    stop_words: Option<&'tb Set<A>>,
    // Words that must be segmented before any language-based segmentation.
    words_dict: Option<&'tb [&'tb str]>,
    // Options handed (borrowed or owned) to the built `Tokenizer`.
    normalizer_option: NormalizerOption<'tb>,
    segmenter_option: SegmenterOption<'tb>,
}
196
197impl<'tb, A> TokenizerBuilder<'tb, A> {
198    /// Create a `TokenizerBuilder` with default settings,
199    ///
200    /// if you don't plan to set stop_words, prefer use [`TokenizerBuilder::default`]
201    pub fn new() -> TokenizerBuilder<'tb, A> {
202        Self {
203            normalizer_option: crate::normalizer::DEFAULT_NORMALIZER_OPTION,
204            segmenter_option: SegmenterOption::default(),
205            stop_words: None,
206            words_dict: None,
207        }
208    }
209}
210
impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
    /// Configure the words that will be classified as `TokenKind::StopWord`.
    ///
    /// # Arguments
    ///
    /// * `stop_words` - a `Set` of the words to classify as stop words.
    pub fn stop_words(&mut self, stop_words: &'tb Set<A>) -> &mut Self {
        self.stop_words = Some(stop_words);
        // Re-wrap the fst's raw bytes in a byte-slice-backed `Set` so the
        // normalizer option does not need to be generic over `A`.
        self.normalizer_option.classifier.stop_words = self.stop_words.map(|sw| {
            let sw = sw.as_fst().as_bytes();
            Set::new(sw).unwrap()
        });
        self
    }

    /// Configure the words that will be used to separate words and classified as `TokenKind::Separator`.
    ///
    /// # Arguments
    ///
    /// * `separators` - a slice of str to classify as separator.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::TokenizerBuilder;
    ///
    /// // create the builder.
    /// let mut builder = TokenizerBuilder::default();
    ///
    /// // create a custom list of separators.
    /// let separators = [" ", ", ", ". ", "?", "!"];
    ///
    /// // configure separators.
    /// builder.separators(&separators);
    ///
    /// // build the tokenizer passing the text to tokenize.
    /// let tokenizer = builder.build();
    ///
    /// // text to tokenize.
    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
    ///
    /// let output: Vec<_> = tokenizer.segment_str(orig).collect();
    /// assert_eq!(
    ///   &output,
    ///   &["The", " ", "quick", " ", "(\"brown\")", " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it's", " ", "29.3°F", "!"]
    /// );
    /// ```
    ///
    pub fn separators(&mut self, separators: &'tb [&'tb str]) -> &mut Self {
        self.normalizer_option.classifier.separators = Some(separators);
        self
    }

    /// Configure the words that will be segmented before any other segmentation.
    ///
    /// This words dictionary is used to override the segmentation over these words,
    /// the tokenizer will find all the occurrences of these words before any Language based segmentation.
    /// If some of the words are in the stop_words' list or in the separators' list,
    /// then they will be categorized as `TokenKind::StopWord` or as `TokenKind::Separator` as well.
    ///
    /// # Arguments
    ///
    /// * `words` - a slice of str.
    ///
    /// # Example
    ///
    /// ```
    /// use charabia::TokenizerBuilder;
    ///
    /// // create the builder.
    /// let mut builder = TokenizerBuilder::default();
    ///
    /// // create a custom list of words.
    /// let words = ["J. R. R.", "Dr.", "J. K."];
    ///
    /// // configure words.
    /// builder.words_dict(&words);
    ///
    /// // build the tokenizer passing the text to tokenize.
    /// let tokenizer = builder.build();
    ///
    /// // text to tokenize.
    /// let orig = "J. R. R. Tolkien. J. K. Rowling. Dr. Seuss";
    ///
    /// let output: Vec<_> = tokenizer.segment_str(orig).collect();
    /// assert_eq!(
    ///   &output,
    ///   &["J. R. R.", " ", "Tolkien", ". ", "J. K.", " ", "Rowling", ". ", "Dr.", " ", "Seuss"]
    /// );
    /// ```
    ///
    pub fn words_dict(&mut self, words: &'tb [&'tb str]) -> &mut Self {
        self.words_dict = Some(words);
        self
    }

    /// Enable or disable the creation of `char_map`.
    ///
    /// # Arguments
    ///
    /// * `create_char_map` - a `bool` that indicates whether a `char_map` should be created.
    pub fn create_char_map(&mut self, create_char_map: bool) -> &mut Self {
        self.normalizer_option.create_char_map = create_char_map;
        self
    }

    /// Enable or disable the lossy normalization.
    ///
    /// A lossy normalization is a kind of normalization that could change the meaning in some way.
    /// Removing diacritics is considered lossy; for instance, in French the word `maïs` (`corn`) will be normalized as `mais` (`but`) which changes the meaning.
    ///
    /// # Arguments
    ///
    /// * `lossy` - a `bool` that enable or disable the lossy normalization.
    pub fn lossy_normalization(&mut self, lossy: bool) -> &mut Self {
        self.normalizer_option.lossy = lossy;
        self
    }

    /// Configure which languages can be used for which script.
    ///
    /// # Arguments
    ///
    /// * `allow_list` - a slice of [`Language`] to limit the selection during autodetection.
    pub fn allow_list(&mut self, allow_list: &'tb [Language]) -> &mut Self {
        self.segmenter_option.allow_list = Some(allow_list);
        self
    }

    /// Build the configured `Tokenizer` borrowing the builder's options.
    pub fn build(&mut self) -> Tokenizer<'_> {
        // If a custom list of separators or/and a custom list of words have been given,
        // then an Aho-Corasick automaton is created to pre-segment the text during the tokenization process
        // TODO: avoid recreating the automaton if nothing changed
        match (self.normalizer_option.classifier.separators, self.words_dict) {
            (Some(separators), None) => {
                // Empty patterns would match everywhere; skip them.
                let pattern = separators.iter().filter(|s| !s.is_empty());
                let aho = AhoCorasick::builder()
                    .match_kind(MatchKind::LeftmostLongest)
                    .build(pattern)
                    .unwrap();

                // A pattern-less automaton is useless; store `None` instead.
                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
            }
            (separators, Some(words)) => {
                // use the default separators' list if a custom words' list is given but no custom separators' list.
                let separators = separators.unwrap_or(DEFAULT_SEPARATORS);
                // merge both lists together and create the Aho-Corasick automaton.
                let pattern = words.iter().chain(separators).filter(|s| !s.is_empty());
                let aho = AhoCorasick::builder()
                    .match_kind(MatchKind::LeftmostLongest)
                    .build(pattern)
                    .unwrap();

                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
            }
            // reset the state in case the builder is reused.
            (None, None) => self.segmenter_option.aho = None,
        }

        Tokenizer {
            normalizer_option: Cow::Borrowed(&self.normalizer_option),
            segmenter_option: Cow::Borrowed(&self.segmenter_option),
        }
    }

    /// Build the configured `Tokenizer` consuming self.
    ///
    /// This method allows dropping the tokenizer builder without having to drop the Tokenizer itself.
    pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
        // Run `build` only for its side effect of (re)creating the Aho-Corasick
        // automaton; the returned borrowing `Tokenizer` is dropped immediately
        // so the option fields can be moved out below.
        drop(self.build());

        Tokenizer {
            normalizer_option: Cow::Owned(self.normalizer_option),
            segmenter_option: Cow::Owned(self.segmenter_option),
        }
    }
}
389
390impl Default for TokenizerBuilder<'_, Vec<u8>> {
391    fn default() -> Self {
392        Self::new()
393    }
394}
395
#[cfg(test)]
mod test {
    use fst::Set;
    use quickcheck::quickcheck;

    use crate::{Tokenize, TokenizerBuilder};

    // Ensures tokens collected from temporary iterators/builders outlive them,
    // i.e. the lifetime bounds on the public API are not over-restrictive.
    #[test]
    fn check_lifetimes() {
        let text = "Hello world! Pleased to see you.";

        // Tokens must remain valid after the `tokenize` iterator is dropped.
        let tokens: Vec<_> = { text.tokenize().collect() };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // Tokens must remain valid after the builder and tokenizer are dropped.
        let tokens: Vec<_> = {
            let mut builder = TokenizerBuilder::default();
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // Same check with a stop-words set borrowed by the builder.
        let tokens: Vec<_> = {
            let stop_words: Set<Vec<u8>> = Set::from_iter(["to"].iter()).unwrap();
            let mut builder = TokenizerBuilder::new();
            let builder = builder.stop_words(&stop_words);
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));
    }

    // Property: tokenization never yields more tokens than input bytes.
    #[quickcheck]
    fn shorten_after_tokenized(text: String) -> bool {
        let text = text.as_str();
        let tokens: Vec<_> = text.tokenize().collect();
        tokens.len() <= text.len()
    }
}