text_tokenizer/
options.rs1use crate::{SentenceBreaker, UnicodeSentenceBreaker};
2
3use std::collections::BTreeSet;
4
5pub trait IntoTokenizer: Sized {
6 type IntoTokens;
7 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens;
8}
9
/// Flags that can be stored in the option set of `TokenizerParams`.
///
/// The derived ordering follows declaration order, so the variant order below
/// is significant wherever these values are compared or kept in an ordered set.
///
/// The effect each flag has on tokenization is implemented outside this file;
/// the notes on the variants only record how this module itself uses them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenizerOptions {
    /// Enabled by the `basic` preset.
    NoComplexTokens,
    /// Enabled by the `complex` preset.
    StructTokens,
    /// Enabled by the `v1` preset.
    SplitDot,
    /// Enabled by the `v1` preset.
    SplitUnderscore,
    /// Enabled by the `v1` preset.
    SplitColon,
    /// Not enabled by any preset in this module; must be added explicitly.
    SplitNumberSign,
    /// Enabled by the `v1`, `basic` and `complex` presets.
    MergePunctuation,
    /// Enabled by the `v1`, `basic` and `complex` presets.
    MergeWhites,
    /// Inserted by `with_default_sentences` and `with_sentence_breaker`.
    WithSentences,
}
22
23pub struct TokenizerParams<S: SentenceBreaker> {
24 pub(crate) options: BTreeSet<TokenizerOptions>,
25 pub(crate) sentence_breaker: S,
26}
27impl Default for TokenizerParams<()> {
28 fn default() -> TokenizerParams<()> {
29 TokenizerParams {
30 options: BTreeSet::new(),
31 sentence_breaker: (),
32 }
33 }
34}
35impl TokenizerParams<()> {
36 pub fn v1() -> TokenizerParams<()> {
37 TokenizerParams::default()
38 .add_option(TokenizerOptions::SplitDot)
39 .add_option(TokenizerOptions::SplitUnderscore)
40 .add_option(TokenizerOptions::SplitColon)
41 .add_option(TokenizerOptions::MergeWhites)
42 .add_option(TokenizerOptions::MergePunctuation)
43 }
44 pub fn basic() -> TokenizerParams<()> {
45 TokenizerParams::default()
46 .add_option(TokenizerOptions::NoComplexTokens)
47 .add_option(TokenizerOptions::MergeWhites)
48 .add_option(TokenizerOptions::MergePunctuation)
49 }
50 pub fn complex() -> TokenizerParams<()> {
51 TokenizerParams::default()
52 .add_option(TokenizerOptions::StructTokens)
53 .add_option(TokenizerOptions::MergeWhites)
54 .add_option(TokenizerOptions::MergePunctuation)
55 }
56}
57impl<S: SentenceBreaker> TokenizerParams<S> {
58 pub fn add_option(mut self, option: TokenizerOptions) -> TokenizerParams<S> {
59 self.options.insert(option);
60 self
61 }
62 pub fn with_default_sentences(mut self) -> TokenizerParams<UnicodeSentenceBreaker> {
63 self.options.insert(TokenizerOptions::WithSentences);
64 TokenizerParams {
65 options: self.options,
66 sentence_breaker: UnicodeSentenceBreaker,
67 }
68 }
69 pub fn with_sentence_breaker<U: SentenceBreaker>(mut self, sb: U) -> TokenizerParams<U> {
70 self.options.insert(TokenizerOptions::WithSentences);
71 TokenizerParams {
72 options: self.options,
73 sentence_breaker: sb,
74 }
75 }
76}