text_tokenizer/
options.rs

1use crate::{SentenceBreaker, UnicodeSentenceBreaker};
2
3use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6    type IntoTokens;
7    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Flags that can be collected in a [`TokenizerParams`] option set.
///
/// The derived `Ord` follows declaration order, which is therefore also the
/// iteration order of the `BTreeSet` these flags are stored in.
///
/// NOTE(review): the code that interprets these flags is not part of this
/// file. The per-variant notes below are inferred from the variant names and
/// from the presets defined on `TokenizerParams<()>` — confirm against the
/// tokenizer implementation.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenizerOptions {
    /// Enabled by the `basic()` preset; presumably turns off complex-token detection.
    NoComplexTokens,
    /// Enabled by the `v1()` and `complex()` presets; presumably turns on structured tokens.
    StructTokens,
    /// Enabled by the `v1()` preset; presumably splits tokens on `.`.
    SplitDot,
    /// Enabled by the `v1()` preset; presumably splits tokens on `_`.
    SplitUnderscore,
    /// Enabled by the `v1()` preset; presumably splits tokens on `:`.
    SplitColon,
    /// Enabled by the `v1()` preset; presumably splits tokens on `;`.
    SplitSemiColon,
    /// Not part of any preset in this file; presumably splits tokens on `#`.
    SplitNumberSign,
    /// Enabled by every preset in this file; presumably merges adjacent punctuation.
    MergePunctuation,
    /// Enabled by every preset in this file; presumably merges adjacent whitespace.
    MergeWhites,
    /// Inserted by `with_default_sentences` and `with_sentence_breaker`.
    WithSentences,
    /// Not part of any preset in this file; presumably selects English number notation by default.
    NumberDefaultEnNotation,
    /// Not part of any preset in this file; presumably selects Russian number notation by default.
    NumberDefaultRuNotation,
}
25
/// Tokenizer configuration: a set of [`TokenizerOptions`] flags paired with a
/// sentence breaker of type `S`.
pub struct TokenizerParams<S: SentenceBreaker> {
    /// Enabled flags; stored as a set, so each flag appears at most once.
    pub(crate) options: BTreeSet<TokenizerOptions>,
    /// The sentence breaker value; `()` for params created through `Default`.
    pub(crate) sentence_breaker: S,
}
30impl Default for TokenizerParams<()> {
31    fn default() -> TokenizerParams<()> {
32        TokenizerParams {
33            options: BTreeSet::new(),
34            sentence_breaker: (),
35        }
36    }
37}
38impl TokenizerParams<()> {
39    pub fn v1() -> TokenizerParams<()> {
40        TokenizerParams::default()
41            .add_option(TokenizerOptions::SplitDot)
42            .add_option(TokenizerOptions::SplitUnderscore)
43            .add_option(TokenizerOptions::SplitColon)
44            .add_option(TokenizerOptions::SplitSemiColon)
45            .add_option(TokenizerOptions::MergeWhites)
46            .add_option(TokenizerOptions::MergePunctuation)
47            .add_option(TokenizerOptions::StructTokens)
48    }
49    pub fn basic() -> TokenizerParams<()> {
50        TokenizerParams::default()
51            .add_option(TokenizerOptions::NoComplexTokens)
52            .add_option(TokenizerOptions::MergeWhites)
53            .add_option(TokenizerOptions::MergePunctuation)
54    }
55    pub fn complex() -> TokenizerParams<()> {
56        TokenizerParams::default()
57            .add_option(TokenizerOptions::StructTokens)
58            .add_option(TokenizerOptions::MergeWhites)
59            .add_option(TokenizerOptions::MergePunctuation)
60    }
61}
62impl<S: SentenceBreaker> TokenizerParams<S> {
63    pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
64        self.options.insert(option);
65        self
66    }
67    pub fn remove_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
68        self.options.remove(&option);
69        self
70    }
71    pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
72        self.options.insert(TokenizerOptions::WithSentences);
73        TokenizerParams {
74            options: self.options,
75            sentence_breaker: UnicodeSentenceBreaker,
76        }
77    }
78    pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
79        self.options.insert(TokenizerOptions::WithSentences);
80        TokenizerParams {
81            options: self.options,
82            sentence_breaker: sb,
83        }
84    }
85
86    pub fn push_option(&mut self, option: TokenizerOptions) {
87        self.options.insert(option);
88    }
89
90    pub fn options(&self) -> &BTreeSet<TokenizerOptions> {
91        &self.options
92    }
93}