// text_tokenizer/options.rs
use crate::{SentenceBreaker, UnicodeSentenceBreaker};

use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6 type IntoTokens;
7 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Individual switches that can be enabled on a [`TokenizerParams`].
///
/// Options are stored in a `BTreeSet`, and the derived `Ord` follows declaration
/// order. Declaration order is therefore also the set's iteration order, so
/// reordering variants changes what callers of `TokenizerParams::options` observe.
///
/// NOTE(review): the effect of each switch on tokenization is implemented outside
/// this file. Notes marked "presumably" are read off the variant names only and
/// should be confirmed against the tokenizer itself.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenizerOptions {
    /// Enabled by the `basic` preset.
    NoComplexTokens,
    /// Enabled by the `v1`, `v1_1` and `complex` presets.
    StructTokens,
    /// Presumably splits tokens at `.`; enabled by `v1` and `v1_1`.
    SplitDot,
    /// Presumably splits tokens at `_`; enabled by `v1` and `v1_1`.
    SplitUnderscore,
    /// Presumably splits tokens at `:`; enabled by `v1` and `v1_1`.
    SplitColon,
    /// Presumably splits tokens at `,` (name is spelled "Coma"); enabled by `v1_1` only.
    SplitComa,
    /// Presumably splits tokens at `;`; enabled by `v1` and `v1_1`.
    SplitSemiColon,
    /// Presumably splits tokens at `#`; not enabled by any preset in this file.
    SplitNumberSign,
    /// Enabled by every preset in this file.
    MergePunctuation,
    /// Enabled by every preset in this file.
    MergeWhites,
    /// Inserted automatically by `TokenizerParams::with_default_sentences` and
    /// `TokenizerParams::with_sentence_breaker`.
    WithSentences,
}
24
25pub struct TokenizerParams<S: SentenceBreaker> {
26 pub(crate) options: BTreeSet<TokenizerOptions>,
27 pub(crate) sentence_breaker: S,
28}
29impl Default for TokenizerParams<()> {
30 fn default() -> TokenizerParams<()> {
31 TokenizerParams {
32 options: BTreeSet::new(),
33 sentence_breaker: (),
34 }
35 }
36}
37impl TokenizerParams<()> {
38 pub fn v1() -> TokenizerParams<()> {
39 TokenizerParams::default()
40 .add_option(TokenizerOptions::SplitDot)
41 .add_option(TokenizerOptions::SplitUnderscore)
42 .add_option(TokenizerOptions::SplitColon)
43 .add_option(TokenizerOptions::SplitSemiColon)
44 .add_option(TokenizerOptions::MergeWhites)
45 .add_option(TokenizerOptions::MergePunctuation)
46 .add_option(TokenizerOptions::StructTokens)
47 }
48 pub fn v1_1() -> TokenizerParams<()> {
49 TokenizerParams::default()
50 .add_option(TokenizerOptions::SplitDot)
51 .add_option(TokenizerOptions::SplitUnderscore)
52 .add_option(TokenizerOptions::SplitColon)
53 .add_option(TokenizerOptions::SplitComa)
54 .add_option(TokenizerOptions::SplitSemiColon)
56 .add_option(TokenizerOptions::MergeWhites)
57 .add_option(TokenizerOptions::MergePunctuation)
58 .add_option(TokenizerOptions::StructTokens)
59 }
60 pub fn basic() -> TokenizerParams<()> {
61 TokenizerParams::default()
62 .add_option(TokenizerOptions::NoComplexTokens)
63 .add_option(TokenizerOptions::MergeWhites)
64 .add_option(TokenizerOptions::MergePunctuation)
65 }
66 pub fn complex() -> TokenizerParams<()> {
67 TokenizerParams::default()
68 .add_option(TokenizerOptions::StructTokens)
69 .add_option(TokenizerOptions::MergeWhites)
70 .add_option(TokenizerOptions::MergePunctuation)
71 }
72}
73impl<S: SentenceBreaker> TokenizerParams<S> {
74 pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
75 self.options.insert(option);
76 self
77 }
78 pub fn remove_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
79 self.options.remove(&option);
80 self
81 }
82 pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
83 self.options.insert(TokenizerOptions::WithSentences);
84 TokenizerParams {
85 options: self.options,
86 sentence_breaker: UnicodeSentenceBreaker,
87 }
88 }
89 pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
90 self.options.insert(TokenizerOptions::WithSentences);
91 TokenizerParams {
92 options: self.options,
93 sentence_breaker: sb,
94 }
95 }
96 pub fn options(&self) -> &BTreeSet<TokenizerOptions> {
97 &self.options
98 }
99}