text_tokenizer/
options.rs

1use crate::{SentenceBreaker, UnicodeSentenceBreaker};
2
3use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6    type IntoTokens;
7    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Switches that can be enabled on a [`TokenizerParams`] value.
///
/// The derived `Ord` follows the declaration order of the variants, which is
/// therefore also the iteration order of a `BTreeSet<TokenizerOptions>`.
/// Do not reorder the variants without checking code that relies on it.
///
/// NOTE(review): the effect of each switch is implemented outside this
/// definition; consult the tokenizer implementation for exact semantics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenizerOptions {
    NoComplexTokens,
    StructTokens,
    SplitDot,
    SplitUnderscore,
    SplitColon,
    SplitComa,
    SplitSemiColon,
    SplitNumberSign,
    MergePunctuation,
    MergeWhites,
    WithSentences,
}
24
25pub struct TokenizerParams<S: SentenceBreaker> {
26    pub(crate) options: BTreeSet<TokenizerOptions>,
27    pub(crate) sentence_breaker: S,
28}
29impl Default for TokenizerParams<()> {
30    fn default() -> TokenizerParams<()> {
31        TokenizerParams {
32            options: BTreeSet::new(),
33            sentence_breaker: (),
34        }
35    }
36}
37impl TokenizerParams<()> {
38    pub fn v1() -> TokenizerParams<()> {
39        TokenizerParams::default()
40            .add_option(TokenizerOptions::SplitDot)
41            .add_option(TokenizerOptions::SplitUnderscore)
42            .add_option(TokenizerOptions::SplitColon)
43            .add_option(TokenizerOptions::SplitSemiColon)
44            .add_option(TokenizerOptions::MergeWhites)
45            .add_option(TokenizerOptions::MergePunctuation)
46            .add_option(TokenizerOptions::StructTokens)
47    }
48    pub fn v1_1() -> TokenizerParams<()> {
49        TokenizerParams::default()
50            .add_option(TokenizerOptions::SplitDot)
51            .add_option(TokenizerOptions::SplitUnderscore)
52            .add_option(TokenizerOptions::SplitColon)
53            .add_option(TokenizerOptions::SplitComa)
54            // NumberSign???
55            .add_option(TokenizerOptions::SplitSemiColon)
56            .add_option(TokenizerOptions::MergeWhites)
57            .add_option(TokenizerOptions::MergePunctuation)
58            .add_option(TokenizerOptions::StructTokens)
59    }
60    pub fn basic() -> TokenizerParams<()> {
61        TokenizerParams::default()
62            .add_option(TokenizerOptions::NoComplexTokens)
63            .add_option(TokenizerOptions::MergeWhites)
64            .add_option(TokenizerOptions::MergePunctuation)
65    }
66    pub fn complex() -> TokenizerParams<()> {
67        TokenizerParams::default()
68            .add_option(TokenizerOptions::StructTokens)
69            .add_option(TokenizerOptions::MergeWhites)
70            .add_option(TokenizerOptions::MergePunctuation)
71    }
72}
73impl<S: SentenceBreaker> TokenizerParams<S> {
74    pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
75        self.options.insert(option);
76        self
77    }
78    pub fn remove_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
79        self.options.remove(&option);
80        self
81    }
82    pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
83        self.options.insert(TokenizerOptions::WithSentences);
84        TokenizerParams {
85            options: self.options,
86            sentence_breaker: UnicodeSentenceBreaker,
87        }
88    }
89    pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
90        self.options.insert(TokenizerOptions::WithSentences);
91        TokenizerParams {
92            options: self.options,
93            sentence_breaker: sb,
94        }
95    }
96    pub fn options(&self) -> &BTreeSet<TokenizerOptions> {
97        &self.options
98    }
99}