// frits/analysis.rs

//! Analyze phrases to split into tokens.

/// Metadata that may be emitted by tokenizers.
///
/// The `DynClone` supertrait allows boxed trait objects of this trait to be
/// cloned, which in turn lets [`Token`] derive `Clone`.
pub trait Metadata
where
    Self: dyn_clone::DynClone,
{
}

// Generates `impl Clone for Box<dyn Metadata>` (and variants) from the
// `DynClone` bound above.
dyn_clone::clone_trait_object!(Metadata);

// The unit type acts as the "no metadata" marker used by simple tokenizers.
impl Metadata for () {}
13
/// A span describes a view into a string in terms of indices.
///
/// Spans are plain data, so they support comparison and hashing in addition
/// to copying and debugging.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct Span {
    /// Start offset, in bytes.
    pub start: usize,

    /// Length, in bytes.
    pub length: usize,
}
23
24impl Span {
25    /// Indexes a string using this span.
26    pub fn index<'a>(&self, s: &'a str) -> &'a str {
27        &s[self.start..self.end()]
28    }
29
30    /// Gets the end of this span.
31    pub const fn end(&self) -> usize {
32        self.start + self.length
33    }
34}
35
/// A token is the smallest unit of text that will be indexed for searching.
#[derive(Clone)]
pub struct Token<'a> {
    /// The span of the token in the original text.
    pub src_span: Span,

    /// The text. It may not match the word in original phrase as it may have been normalized in some way (e.g. stemming).
    pub text: std::borrow::Cow<'a, str>,

    /// Additional metadata that may be emitted from the tokenizer.
    ///
    /// Boxed so different tokenizers can attach different metadata types; the
    /// `DynClone` bound on [`Metadata`] keeps the whole token `Clone`.
    pub metadata: Box<dyn Metadata + 'a>,
}
48
49impl<'a> Token<'a> {
50    /// Applies a mapping function over the text and returns the new token.
51    pub fn map_text(self, f: impl for<'b> Fn(&'b str) -> std::borrow::Cow<'b, str>) -> Self {
52        Self {
53            text: match &self.text {
54                std::borrow::Cow::Borrowed(text) => f(text),
55                std::borrow::Cow::Owned(text) => f(text).into_owned().into(),
56            },
57            ..self
58        }
59    }
60}
61
/// A tokenizer turns a phrase into a list of tokens.
pub trait Tokenizer {
    /// Tokenizes the input, returning an iterator over the tokens.
    ///
    /// The yielded tokens may borrow from both the tokenizer and the input
    /// string.
    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>>;

    /// Filters the output stream using a given [`Filter`].
    fn filter<F>(self, filter: F) -> impl Tokenizer
    where
        Self: Sized,
        F: Filter,
    {
        FilterTokenizer {
            tokenizer: self,
            filter,
        }
    }

    /// Creates a boxed version of the tokenizer.
    ///
    /// Useful for storing tokenizers of different concrete types together;
    /// see [`DynTokenizer`].
    fn boxed(self) -> Box<dyn DynTokenizer>
    where
        Self: Sized + 'static,
    {
        Box::new(self)
    }
}
87
/// A boxable version of [`Tokenizer`].
///
/// Unlike [`Tokenizer::tokenize`], this returns a boxed iterator, which makes
/// the trait object-safe at the cost of one allocation per call.
pub trait DynTokenizer {
    /// Tokenizes the input, returning a boxed iterator over the tokens.
    fn tokenize<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Token<'a>> + 'a>;
}
92
// Every `Tokenizer` is automatically usable as a `DynTokenizer` by boxing the
// iterator it produces.
impl<T> DynTokenizer for T
where
    T: Tokenizer,
{
    fn tokenize<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Token<'a>> + 'a> {
        // The fully-qualified path selects `Tokenizer::tokenize` explicitly;
        // `self.tokenize(s)` would be ambiguous between the two traits here.
        Box::new(Tokenizer::tokenize(self, s))
    }
}
101
/// A tokenizer that post-processes another tokenizer's output with a [`Filter`].
///
/// Created by [`Tokenizer::filter`].
struct FilterTokenizer<T, F> {
    tokenizer: T,
    filter: F,
}
106
107impl<T, F> Tokenizer for FilterTokenizer<T, F>
108where
109    T: Tokenizer,
110    F: Filter,
111{
112    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
113        self.filter.apply(self.tokenizer.tokenize(s))
114    }
115}
116
117/// Splits phrases by Unicode word boundaries, as defined by [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
118pub struct UnicodeWordsTokenizer;
119
120impl Tokenizer for UnicodeWordsTokenizer {
121    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
122        use unicode_segmentation::UnicodeSegmentation as _;
123
124        s.unicode_word_indices().map(|(start, word)| Token {
125            src_span: Span {
126                start,
127                length: word.len(),
128            },
129            text: std::borrow::Cow::Borrowed(word),
130            metadata: Box::new(()),
131        })
132    }
133}
134
135/// Splits phrases by Unicode whitespace characters, as defined according to the terms of the Unicode Derived Core Property `White_Space`.
136pub struct WhitespaceTokenizer;
137
138impl Tokenizer for WhitespaceTokenizer {
139    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
140        s.split_whitespace().map(move |word| Token {
141            src_span: Span {
142                start: word.as_ptr() as usize - s.as_ptr() as usize,
143                length: word.len(),
144            },
145            text: std::borrow::Cow::Borrowed(word),
146            metadata: Box::new(()),
147        })
148    }
149}
150
#[cfg(feature = "chinese")]
mod chinese {
    use super::*;

    /// Splits phrases using Chinese word segmentation.
    ///
    /// This uses [`jieba_rs::Jieba`].
    pub struct ChineseTokenizer {
        jieba: jieba_rs::Jieba,
    }

    impl ChineseTokenizer {
        /// Creates a new Chinese tokenizer with the default jieba dictionary.
        pub fn new() -> Self {
            Self::from_jieba(jieba_rs::Jieba::new())
        }

        /// Creates a new Chinese tokenizer using the given [`jieba_rs::Jieba`] instance.
        pub fn from_jieba(jieba: jieba_rs::Jieba) -> Self {
            Self { jieba }
        }
    }

    // `new()` takes no arguments, so provide `Default` as well (clippy:
    // `new_without_default`); this lets the tokenizer be used in derived
    // `Default` contexts.
    impl Default for ChineseTokenizer {
        fn default() -> Self {
            Self::new()
        }
    }

    impl Tokenizer for ChineseTokenizer {
        fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
            // NOTE(review): `Span` is documented in bytes; confirm jieba's
            // `token.start`/`token.end` are byte offsets rather than char
            // offsets before relying on `src_span` for slicing.
            self.jieba
                .tokenize(s, jieba_rs::TokenizeMode::Default, false)
                .into_iter()
                .map(|token| Token {
                    src_span: Span {
                        start: token.start,
                        length: token.end - token.start,
                    },
                    text: std::borrow::Cow::Borrowed(token.word),
                    metadata: Box::new(()),
                })
        }
    }
}
190
191#[cfg(feature = "chinese")]
192pub use chinese::*;
193
#[cfg(feature = "japanese-korean")]
mod jako {
    use super::*;

    /// Splits phrases using Japanese/Korean word segmentation.
    ///
    /// Each token emitted by this will also contain [`JaKoTokenMetadata`] in its [`Token::metadata`].
    ///
    /// This uses [`lindera::tokenizer::Tokenizer`].
    pub struct JaKoTokenizer {
        segmenter: lindera::segmenter::Segmenter,
    }

    impl JaKoTokenizer {
        /// Creates a new Japanese/Korean tokenizer using the given [`lindera::dictionary::Dictionary`] and default segmenter settings.
        pub fn from_dictionary(
            dictionary: lindera::dictionary::Dictionary,
            user_dictionary: Option<lindera::dictionary::UserDictionary>,
        ) -> Self {
            Self::from_segmenter(lindera::segmenter::Segmenter::new(
                lindera::mode::Mode::Normal,
                dictionary,
                user_dictionary,
            ))
        }

        /// Creates a new Japanese/Korean tokenizer using the given [`lindera::segmenter::Segmenter`].
        pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> Self {
            Self { segmenter }
        }
    }

    /// Extra metadata for tokens emitted by [`JaKoTokenizer`].
    #[derive(Clone)]
    pub struct JaKoTokenMetadata<'a> {
        /// Word details from the dictionary.
        pub details: Option<Vec<std::borrow::Cow<'a, str>>>,
    }

    impl<'a> Metadata for JaKoTokenMetadata<'a> {}

    impl Tokenizer for JaKoTokenizer {
        fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
            // The iterator chain is returned directly as `impl Iterator`;
            // boxing it (as the previous version did) only added a needless
            // heap allocation.
            //
            // NOTE(review): `segment` appears to return a fallible/nested
            // container whose outer `into_iter` is flattened here, so
            // segmentation failures are silently dropped — confirm this is
            // intentional.
            self.segmenter
                .segment(std::borrow::Cow::Borrowed(s))
                .into_iter()
                .flat_map(|tokens| {
                    tokens.into_iter().map(|token| Token {
                        src_span: Span {
                            start: token.byte_start,
                            length: token.byte_end - token.byte_start,
                        },
                        text: token.text,
                        metadata: Box::new(JaKoTokenMetadata {
                            details: token.details,
                        }),
                    })
                })
        }
    }
}
257
258#[cfg(feature = "japanese-korean")]
259pub use jako::*;
260
/// A filter modifies a token stream and can either modify, add, or delete tokens.
pub trait Filter {
    /// Applies the filter to the token stream, returning a new stream of
    /// tokens that replaces it.
    fn apply<'a>(&self, tokens: impl Iterator<Item = Token<'a>>)
        -> impl Iterator<Item = Token<'a>>;
}
267
#[cfg(feature = "stemming")]
mod stemming {
    use super::*;

    /// A filter that stems tokens.
    pub struct StemmingFilter {
        stemmer: rust_stemmers::Stemmer,
    }

    impl StemmingFilter {
        /// Create a filter that stems tokens using the given algorithm.
        pub fn new(algorithm: rust_stemmers::Algorithm) -> Self {
            let stemmer = rust_stemmers::Stemmer::create(algorithm);
            Self { stemmer }
        }
    }

    impl Filter for StemmingFilter {
        /// Stems each token's text; spans and metadata pass through untouched.
        fn apply<'a>(
            &self,
            tokens: impl Iterator<Item = Token<'a>>,
        ) -> impl Iterator<Item = Token<'a>> {
            let stemmer = &self.stemmer;
            tokens.map(move |token| token.map_text(|text| stemmer.stem(text)))
        }
    }
}
295
296#[cfg(feature = "stemming")]
297pub use stemming::*;
298
299/// A filter that lowercases.
300pub struct LowercaseFilter;
301
302impl Filter for LowercaseFilter {
303    fn apply<'a>(
304        &self,
305        tokens: impl Iterator<Item = Token<'a>>,
306    ) -> impl Iterator<Item = Token<'a>> {
307        tokens.map(|token| token.map_text(|text| text.to_lowercase().into()))
308    }
309}
310
311/// A filter that lowercases for Turkish or Azerbaijani.
312///
313/// It will map İ to i and I to ı, e.g. MEKSİKALI will become meksikalı.
314pub struct TrAzLowercaseFilter;
315
316impl Filter for TrAzLowercaseFilter {
317    fn apply<'a>(
318        &self,
319        tokens: impl Iterator<Item = Token<'a>>,
320    ) -> impl Iterator<Item = Token<'a>> {
321        tokens.map(|token| {
322            token.map_text(|text| {
323                text.chars()
324                    .map(|c| match c {
325                        'İ' => "i".to_string(),
326                        'I' => "ı".to_string(),
327                        c => c.to_lowercase().to_string(),
328                    })
329                    .collect::<String>()
330                    .into()
331            })
332        })
333    }
334}
335
336/// A filter that uppercases.
337pub struct UppercaseFilter;
338
339impl Filter for UppercaseFilter {
340    fn apply<'a>(
341        &self,
342        tokens: impl Iterator<Item = Token<'a>>,
343    ) -> impl Iterator<Item = Token<'a>> {
344        tokens.map(|token| token.map_text(|text| text.to_uppercase().into()))
345    }
346}
347
348/// A filter that uppercases for Turkish or Azerbaijani.
349///
350/// It will map i to İ and ı to I, e.g. meksikalı will become MEKSİKALI.
351pub struct TrAzUppercaseFilter;
352
353impl Filter for TrAzUppercaseFilter {
354    fn apply<'a>(
355        &self,
356        tokens: impl Iterator<Item = Token<'a>>,
357    ) -> impl Iterator<Item = Token<'a>> {
358        tokens.map(|token| {
359            token.map_text(|text| {
360                text.chars()
361                    .map(|c| match c {
362                        'i' => "İ".to_string(),
363                        c => c.to_uppercase().to_string(),
364                    })
365                    .collect::<String>()
366                    .into()
367            })
368        })
369    }
370}
371
372/// A filter that removes metadata.
373pub struct StripMetadataFilter;
374
375impl Filter for StripMetadataFilter {
376    fn apply<'a>(
377        &self,
378        tokens: impl Iterator<Item = Token<'a>>,
379    ) -> impl Iterator<Item = Token<'a>> {
380        tokens.map(|token| Token {
381            metadata: Box::new(()),
382            ..token
383        })
384    }
385}
386
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a single token over `text` (span values are irrelevant to the
    /// case-mapping filters under test).
    fn token(text: &str) -> Token<'_> {
        Token {
            src_span: Span {
                start: 0,
                length: 1,
            },
            text: text.into(),
            metadata: Box::new(()),
        }
    }

    /// Runs `filter` over a one-token stream and collects the resulting texts.
    fn apply_one<'a>(filter: &impl Filter, text: &'a str) -> Vec<std::borrow::Cow<'a, str>> {
        filter
            .apply(std::iter::once(token(text)))
            .map(|token| token.text)
            .collect()
    }

    #[test]
    fn test_tr_az_lowercase_filter() {
        assert_eq!(apply_one(&TrAzLowercaseFilter, "MEKSİKALI"), &["meksikalı"]);
    }

    #[test]
    fn test_tr_az_uppercase_filter() {
        assert_eq!(apply_one(&TrAzUppercaseFilter, "meksikalı"), &["MEKSİKALI"]);
    }
}