cozo/fts/tokenizer/
mod.rs

1/*
2 * Code under this module is adapted from the Tantivy project
3 * https://github.com/quickwit-oss/tantivy/tree/0.19.2/src/tokenizer
4 * All code here are licensed under the MIT license, as in the original project.
5 */
6
//! Tokenizers are in charge of chopping text into a stream of tokens
8//! ready for indexing.
9//!
10//! You must define in your schema which tokenizer should be used for
//! each of your fields:
12//!
13//! ```text
14//! use tantivy::schema::*;
15//!
16//! let mut schema_builder = Schema::builder();
17//!
18//! let text_options = TextOptions::default()
19//!     .set_indexing_options(
20//!         TextFieldIndexing::default()
21//!             .set_tokenizer("en_stem")
22//!             .set_index_option(IndexRecordOption::Basic)
23//!     )
24//!     .set_stored();
25//!
26//! let id_options = TextOptions::default()
27//!     .set_indexing_options(
28//!         TextFieldIndexing::default()
29//!             .set_tokenizer("raw_ids")
30//!             .set_index_option(IndexRecordOption::WithFreqsAndPositions)
31//!     )
32//!     .set_stored();
33//!
34//! schema_builder.add_text_field("title", text_options.clone());
35//! schema_builder.add_text_field("text", text_options);
36//! schema_builder.add_text_field("uuid", id_options);
37//!
38//! let schema = schema_builder.build();
39//! ```
40//!
41//! By default, `tantivy` offers the following tokenizers:
42//!
43//! ## `default`
44//!
45//! `default` is the tokenizer that will be used if you do not
46//! assign a specific tokenizer to your text field.
//! It will chop your text on punctuation and whitespace,
//! remove tokens that are longer than 40 chars, and lowercase your text.
49//!
50//! ## `raw`
//! Does not actually tokenize your text. It keeps it entirely unprocessed.
52//! It can be useful to index uuids, or urls for instance.
53//!
54//! ## `en_stem`
55//!
//! In addition to what `default` does, the `en_stem` tokenizer also
//! applies stemming to your tokens. Stemming consists of trimming words to
58//! remove their inflection. This tokenizer is slower than the default one,
59//! but is recommended to improve recall.
60//!
61//!
62//! # Custom tokenizers
63//!
64//! You can write your own tokenizer by implementing the [`Tokenizer`] trait
65//! or you can extend an existing [`Tokenizer`] by chaining it with several
66//! [`TokenFilter`]s.
67//!
68//! For instance, the `en_stem` is defined as follows.
69//!
70//! ```text
71//! use tantivy::tokenizer::*;
72//!
73//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
74//!     .filter(RemoveLongFilter::limit(40))
75//!     .filter(LowerCaser)
76//!     .filter(Stemmer::new(Language::English));
77//! ```
78//!
79//! Once your tokenizer is defined, you need to
80//! register it with a name in your index's [`TokenizerManager`].
81//!
82//! ```text
83//! # use tantivy::schema::Schema;
84//! # use tantivy::tokenizer::*;
85//! # use tantivy::Index;
86//! #
87//! let custom_en_tokenizer = SimpleTokenizer;
88//! # let schema = Schema::builder().build();
89//! let index = Index::create_in_ram(schema);
90//! index.tokenizers()
91//!      .register("custom_en", custom_en_tokenizer);
92//! ```
93//!
//! If you built your schema programmatically, a complete example
//! could look like this for instance.
//!
//! Note that tokens with a length greater than or equal to
//! [`MAX_TOKEN_LEN`] are simply ignored.
99//!
100//! # Example
101//!
102//! ```text
103//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
104//! use tantivy::tokenizer::*;
105//! use tantivy::Index;
106//!
107//! let mut schema_builder = Schema::builder();
108//! let text_field_indexing = TextFieldIndexing::default()
109//!     .set_tokenizer("custom_en")
110//!     .set_index_option(IndexRecordOption::WithFreqsAndPositions);
111//! let text_options = TextOptions::default()
112//!     .set_indexing_options(text_field_indexing)
113//!     .set_stored();
114//! schema_builder.add_text_field("title", text_options);
115//! let schema = schema_builder.build();
116//! let index = Index::create_in_ram(schema);
117//!
118//! // We need to register our tokenizer :
119//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
120//!     .filter(RemoveLongFilter::limit(40))
121//!     .filter(LowerCaser);
122//! index
123//!     .tokenizers()
124//!     .register("custom_en", custom_en_tokenizer);
125//! ```
126mod alphanum_only;
127mod ascii_folding_filter;
128mod empty_tokenizer;
129mod lower_caser;
130mod ngram_tokenizer;
131mod raw_tokenizer;
132mod remove_long;
133mod simple_tokenizer;
134mod split_compound_words;
135mod stemmer;
136mod stop_word_filter;
137mod tokenized_string;
138mod tokenizer_impl;
139mod whitespace_tokenizer;
140
141pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
142pub(crate) use self::ascii_folding_filter::AsciiFoldingFilter;
143pub(crate) use self::lower_caser::LowerCaser;
144pub(crate) use self::ngram_tokenizer::NgramTokenizer;
145pub(crate) use self::raw_tokenizer::RawTokenizer;
146pub(crate) use self::remove_long::RemoveLongFilter;
147pub(crate) use self::simple_tokenizer::SimpleTokenizer;
148pub(crate) use self::split_compound_words::SplitCompoundWords;
149pub(crate) use self::stemmer::{Language, Stemmer};
150pub(crate) use self::stop_word_filter::StopWordFilter;
151// pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
152pub(crate) use self::tokenizer_impl::{
153    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
154};
155pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
156
#[cfg(test)]
pub(crate) mod tests {
    use crate::fts::tokenizer::Token;

    /// Asserts that `token` carries the expected `position`, `text`,
    /// `offset_from` and `offset_to` fields, panicking with a descriptive
    /// message (which includes the full token) on the first mismatch.
    ///
    /// This helper is shared by the unit tests of the individual tokenizers
    /// and token filters in this module.
    pub(crate) fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(
            token.position, position,
            "expected position {} but got {:?}",
            position, token
        );
        assert_eq!(token.text, text, "expected text {} but got {:?}", text, token);
        assert_eq!(
            token.offset_from, from,
            "expected offset_from {} but got {:?}",
            from, token
        );
        assert_eq!(
            token.offset_to, to,
            "expected offset_to {} but got {:?}",
            to, token
        );
    }
}