text_tokenizer/
options.rs1use crate::{SentenceBreaker, UnicodeSentenceBreaker};
2
3use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6 type IntoTokens;
7 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Flags that can be switched on in a [`TokenizerParams`] option set.
///
/// The variants carry no data. How the tokenizer interprets each flag is
/// implemented outside this file, so the per-variant notes below only record
/// where the flag is used here.
///
/// The derived ordering follows declaration order; `Ord` is what allows the
/// flags to be stored in a `BTreeSet`. `Hash` is derived as well so the flags
/// can also be used as keys of hash-based collections.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TokenizerOptions {
    /// Enabled by the [`TokenizerParams::basic`] preset.
    NoComplexTokens,
    /// Enabled by the [`TokenizerParams::v1`] and [`TokenizerParams::complex`] presets.
    StructTokens,
    /// Enabled by the [`TokenizerParams::v1`] preset.
    SplitDot,
    /// Enabled by the [`TokenizerParams::v1`] preset.
    SplitUnderscore,
    /// Enabled by the [`TokenizerParams::v1`] preset.
    SplitColon,
    /// Enabled by the [`TokenizerParams::v1`] preset.
    SplitSemiColon,
    /// Not enabled by any preset defined in this file.
    SplitNumberSign,
    /// Enabled by all three presets (`v1`, `basic`, `complex`).
    MergePunctuation,
    /// Enabled by all three presets (`v1`, `basic`, `complex`).
    MergeWhites,
    /// Inserted automatically by [`TokenizerParams::with_default_sentences`]
    /// and [`TokenizerParams::with_sentence_breaker`].
    WithSentences,
    /// Not enabled by any preset defined in this file.
    NumberDefaultEnNotation,
    /// Not enabled by any preset defined in this file.
    NumberDefaultRuNotation,
}
25
26pub struct TokenizerParams<S: SentenceBreaker> {
27 pub(crate) options: BTreeSet<TokenizerOptions>,
28 pub(crate) sentence_breaker: S,
29}
30impl Default for TokenizerParams<()> {
31 fn default() -> TokenizerParams<()> {
32 TokenizerParams {
33 options: BTreeSet::new(),
34 sentence_breaker: (),
35 }
36 }
37}
38impl TokenizerParams<()> {
39 pub fn v1() -> TokenizerParams<()> {
40 TokenizerParams::default()
41 .add_option(TokenizerOptions::SplitDot)
42 .add_option(TokenizerOptions::SplitUnderscore)
43 .add_option(TokenizerOptions::SplitColon)
44 .add_option(TokenizerOptions::SplitSemiColon)
45 .add_option(TokenizerOptions::MergeWhites)
46 .add_option(TokenizerOptions::MergePunctuation)
47 .add_option(TokenizerOptions::StructTokens)
48 }
49 pub fn basic() -> TokenizerParams<()> {
50 TokenizerParams::default()
51 .add_option(TokenizerOptions::NoComplexTokens)
52 .add_option(TokenizerOptions::MergeWhites)
53 .add_option(TokenizerOptions::MergePunctuation)
54 }
55 pub fn complex() -> TokenizerParams<()> {
56 TokenizerParams::default()
57 .add_option(TokenizerOptions::StructTokens)
58 .add_option(TokenizerOptions::MergeWhites)
59 .add_option(TokenizerOptions::MergePunctuation)
60 }
61}
62impl<S: SentenceBreaker> TokenizerParams<S> {
63 pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
64 self.options.insert(option);
65 self
66 }
67 pub fn remove_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
68 self.options.remove(&option);
69 self
70 }
71 pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
72 self.options.insert(TokenizerOptions::WithSentences);
73 TokenizerParams {
74 options: self.options,
75 sentence_breaker: UnicodeSentenceBreaker,
76 }
77 }
78 pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
79 self.options.insert(TokenizerOptions::WithSentences);
80 TokenizerParams {
81 options: self.options,
82 sentence_breaker: sb,
83 }
84 }
85
86 pub fn push_option(&mut self, option: TokenizerOptions) {
87 self.options.insert(option);
88 }
89
90 pub fn options(&self) -> &BTreeSet<TokenizerOptions> {
91 &self.options
92 }
93}