cozo/fts/tokenizer/mod.rs
1/*
2 * Code under this module is adapted from the Tantivy project
3 * https://github.com/quickwit-oss/tantivy/tree/0.19.2/src/tokenizer
 * All code here is licensed under the MIT license, as in the original project.
5 */
6
//! Tokenizers are in charge of chopping text into a stream of tokens
8//! ready for indexing.
9//!
10//! You must define in your schema which tokenizer should be used for
11//! each of your fields :
12//!
13//! ```text
14//! use tantivy::schema::*;
15//!
16//! let mut schema_builder = Schema::builder();
17//!
18//! let text_options = TextOptions::default()
19//! .set_indexing_options(
20//! TextFieldIndexing::default()
21//! .set_tokenizer("en_stem")
22//! .set_index_option(IndexRecordOption::Basic)
23//! )
24//! .set_stored();
25//!
26//! let id_options = TextOptions::default()
27//! .set_indexing_options(
28//! TextFieldIndexing::default()
29//! .set_tokenizer("raw_ids")
30//! .set_index_option(IndexRecordOption::WithFreqsAndPositions)
31//! )
32//! .set_stored();
33//!
34//! schema_builder.add_text_field("title", text_options.clone());
35//! schema_builder.add_text_field("text", text_options);
36//! schema_builder.add_text_field("uuid", id_options);
37//!
38//! let schema = schema_builder.build();
39//! ```
40//!
41//! By default, `tantivy` offers the following tokenizers:
42//!
43//! ## `default`
44//!
45//! `default` is the tokenizer that will be used if you do not
46//! assign a specific tokenizer to your text field.
//! It chops your text on punctuation and whitespace,
//! removes tokens that are longer than 40 chars, and lowercases your text.
49//!
50//! ## `raw`
//! Does not actually tokenize your text. It keeps it entirely unprocessed.
52//! It can be useful to index uuids, or urls for instance.
53//!
54//! ## `en_stem`
55//!
//! In addition to what `default` does, the `en_stem` tokenizer also
//! applies stemming to your tokens. Stemming consists of trimming words to
58//! remove their inflection. This tokenizer is slower than the default one,
59//! but is recommended to improve recall.
60//!
61//!
62//! # Custom tokenizers
63//!
64//! You can write your own tokenizer by implementing the [`Tokenizer`] trait
65//! or you can extend an existing [`Tokenizer`] by chaining it with several
66//! [`TokenFilter`]s.
67//!
//! For instance, the `en_stem` tokenizer is defined as follows.
69//!
70//! ```text
71//! use tantivy::tokenizer::*;
72//!
73//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
74//! .filter(RemoveLongFilter::limit(40))
75//! .filter(LowerCaser)
76//! .filter(Stemmer::new(Language::English));
77//! ```
78//!
79//! Once your tokenizer is defined, you need to
80//! register it with a name in your index's [`TokenizerManager`].
81//!
82//! ```text
83//! # use tantivy::schema::Schema;
84//! # use tantivy::tokenizer::*;
85//! # use tantivy::Index;
86//! #
87//! let custom_en_tokenizer = SimpleTokenizer;
88//! # let schema = Schema::builder().build();
89//! let index = Index::create_in_ram(schema);
90//! index.tokenizers()
91//! .register("custom_en", custom_en_tokenizer);
92//! ```
93//!
94//! If you built your schema programmatically, a complete example
//! could look like this, for instance.
96//!
97//! Note that tokens with a len greater or equal to
98//! [`MAX_TOKEN_LEN`].
99//!
100//! # Example
101//!
102//! ```text
103//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
104//! use tantivy::tokenizer::*;
105//! use tantivy::Index;
106//!
107//! let mut schema_builder = Schema::builder();
108//! let text_field_indexing = TextFieldIndexing::default()
109//! .set_tokenizer("custom_en")
110//! .set_index_option(IndexRecordOption::WithFreqsAndPositions);
111//! let text_options = TextOptions::default()
112//! .set_indexing_options(text_field_indexing)
113//! .set_stored();
114//! schema_builder.add_text_field("title", text_options);
115//! let schema = schema_builder.build();
116//! let index = Index::create_in_ram(schema);
117//!
118//! // We need to register our tokenizer :
119//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
120//! .filter(RemoveLongFilter::limit(40))
121//! .filter(LowerCaser);
122//! index
123//! .tokenizers()
124//! .register("custom_en", custom_en_tokenizer);
125//! ```
126mod alphanum_only;
127mod ascii_folding_filter;
128mod empty_tokenizer;
129mod lower_caser;
130mod ngram_tokenizer;
131mod raw_tokenizer;
132mod remove_long;
133mod simple_tokenizer;
134mod split_compound_words;
135mod stemmer;
136mod stop_word_filter;
137mod tokenized_string;
138mod tokenizer_impl;
139mod whitespace_tokenizer;
140
141pub(crate) use self::alphanum_only::AlphaNumOnlyFilter;
142pub(crate) use self::ascii_folding_filter::AsciiFoldingFilter;
143pub(crate) use self::lower_caser::LowerCaser;
144pub(crate) use self::ngram_tokenizer::NgramTokenizer;
145pub(crate) use self::raw_tokenizer::RawTokenizer;
146pub(crate) use self::remove_long::RemoveLongFilter;
147pub(crate) use self::simple_tokenizer::SimpleTokenizer;
148pub(crate) use self::split_compound_words::SplitCompoundWords;
149pub(crate) use self::stemmer::{Language, Stemmer};
150pub(crate) use self::stop_word_filter::StopWordFilter;
151// pub(crate) use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
152pub(crate) use self::tokenizer_impl::{
153 BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
154};
155pub(crate) use self::whitespace_tokenizer::WhitespaceTokenizer;
156
#[cfg(test)]
pub(crate) mod tests {
    use crate::fts::tokenizer::Token;

    /// Test helper that checks a token's position, text and byte offsets.
    ///
    /// The checks run in a fixed order (`position`, `text`, `offset_from`,
    /// `offset_to`) and the first mismatch panics through `assert_eq!`. Every
    /// failure message carries the expected value together with the `Debug`
    /// rendering of the whole token, so the offending token is visible in the
    /// test output.
    ///
    /// * `token` - the token under test.
    /// * `position` - value expected in `token.position`.
    /// * `text` - value expected in `token.text`.
    /// * `from` - value expected in `token.offset_from`.
    /// * `to` - value expected in `token.offset_to`.
    pub(crate) fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        // Position within the token stream.
        assert_eq!(token.position, position, "expected position {} but {:?}", position, token);

        // Token text.
        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);

        // Start and end offsets, checked separately so the message names the wrong one.
        assert_eq!(token.offset_from, from, "expected offset_from {} but {:?}", from, token);
        assert_eq!(token.offset_to, to, "expected offset_to {} but {:?}", to, token);
    }
}
186}