text_tokenizer/
options.rs

1use crate::{SentenceBreaker, UnicodeSentenceBreaker};
2
3use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6    type IntoTokens;
7    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Individual flags that can be stored in a `TokenizerParams` option set.
///
/// The derived `PartialOrd`/`Ord` follow the declaration order of the
/// variants, and the flags are kept in a `BTreeSet`, so reordering variants
/// changes both comparisons and set iteration order.
///
/// NOTE(review): the exact effect of each flag is implemented by the tokenizer
/// itself and is not visible in this file; the per-variant notes below only
/// record where this file inserts each flag.
#[derive(Debug, Clone, Copy)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenizerOptions {
    /// Inserted by the `basic()` preset.
    NoComplexTokens,
    /// Inserted by the `complex()` preset.
    StructTokens,
    /// Inserted by the `v1()` preset.
    SplitDot,
    /// Inserted by the `v1()` preset.
    SplitUnderscore,
    /// Inserted by the `v1()` preset.
    SplitColon,
    /// Not inserted by any preset in this file; must be added explicitly.
    SplitNumberSign,
    /// Inserted by all three presets (`v1()`, `basic()`, `complex()`).
    MergePunctuation,
    /// Inserted by all three presets (`v1()`, `basic()`, `complex()`).
    MergeWhites,
    /// Inserted by `with_default_sentences()` and `with_sentence_breaker()`.
    WithSentences,
}
22
23pub struct TokenizerParams<S: SentenceBreaker> {
24    pub(crate) options: BTreeSet<TokenizerOptions>,
25    pub(crate) sentence_breaker: S,
26}
27impl Default for TokenizerParams<()> {
28    fn default() -> TokenizerParams<()> {
29        TokenizerParams {
30            options: BTreeSet::new(),
31            sentence_breaker: (),
32        }
33    }
34}
35impl TokenizerParams<()> {
36    pub fn v1() -> TokenizerParams<()> {
37        TokenizerParams::default()
38            .add_option(TokenizerOptions::SplitDot)
39            .add_option(TokenizerOptions::SplitUnderscore)
40            .add_option(TokenizerOptions::SplitColon)
41            .add_option(TokenizerOptions::MergeWhites)
42            .add_option(TokenizerOptions::MergePunctuation)
43    }
44    pub fn basic() -> TokenizerParams<()> {
45        TokenizerParams::default()
46            .add_option(TokenizerOptions::NoComplexTokens)
47            .add_option(TokenizerOptions::MergeWhites)
48            .add_option(TokenizerOptions::MergePunctuation)
49    }
50    pub fn complex() -> TokenizerParams<()> {
51        TokenizerParams::default()
52            .add_option(TokenizerOptions::StructTokens)
53            .add_option(TokenizerOptions::MergeWhites)
54            .add_option(TokenizerOptions::MergePunctuation)
55    }
56}
57impl<S: SentenceBreaker> TokenizerParams<S> {
58    pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
59        self.options.insert(option);
60        self
61    }
62    pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
63        self.options.insert(TokenizerOptions::WithSentences);
64        TokenizerParams {
65            options: self.options,
66            sentence_breaker: UnicodeSentenceBreaker,
67        }
68    }
69    pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
70        self.options.insert(TokenizerOptions::WithSentences);
71        TokenizerParams {
72            options: self.options,
73            sentence_breaker: sb,
74        }
75    }
76}