1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
use crate::bert::BertConfig;
use crate::distilbert::DistilBertConfig;
use crate::electra::ElectraConfig;
use crate::Config;
use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
use rust_tokenizers::{BertTokenizer, RobertaTokenizer, TokenizedInput, TruncationStrategy};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
/// Identifies the supported transformer model families.
///
/// Used to select the matching configuration ([`ConfigOption`]) and tokenizer
/// ([`TokenizerOption`]) implementations for a pipeline.
// Debug/PartialEq/Eq/Hash added: backward-compatible derives that let callers
// log, compare, and key on the model type; the enum is a plain fieldless
// discriminant so all four are trivially correct.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ModelType {
    Bert,
    DistilBert,
    Roberta,
    Electra,
}
/// Model configuration wrapper, holding the parsed configuration for the
/// selected model family.
///
/// Note there is no `Roberta` variant: Roberta reuses the BERT configuration
/// format and is stored as `Bert` (see `ConfigOption::from_file`).
pub enum ConfigOption {
    /// Configuration for BERT (also used for Roberta).
    Bert(BertConfig),
    /// Configuration for DistilBERT.
    DistilBert(DistilBertConfig),
    /// Configuration for Electra.
    Electra(ElectraConfig),
}
/// Tokenizer wrapper for the selected model family.
///
/// Only two concrete tokenizers exist: the BERT wordpiece tokenizer (also used
/// for DistilBert and Electra — see `TokenizerOption::from_file`) and the
/// Roberta BPE tokenizer.
pub enum TokenizerOption {
    /// BERT wordpiece tokenizer (also used for DistilBert and Electra).
    Bert(BertTokenizer),
    /// Roberta byte-pair-encoding tokenizer.
    Roberta(RobertaTokenizer),
}
impl ConfigOption {
    /// Loads the configuration matching `model_type` from the JSON file at `path`.
    ///
    /// Roberta shares the BERT configuration format and is therefore
    /// deserialized as a `BertConfig`.
    pub fn from_file(model_type: ModelType, path: &Path) -> Self {
        match model_type {
            ModelType::Bert | ModelType::Roberta => Self::Bert(BertConfig::from_file(path)),
            ModelType::DistilBert => Self::DistilBert(DistilBertConfig::from_file(path)),
            ModelType::Electra => Self::Electra(ElectraConfig::from_file(path)),
        }
    }

    /// Consumes the configuration and returns its id -> label mapping.
    ///
    /// # Panics
    ///
    /// Panics if the configuration file did not define an `id2label` dictionary.
    pub fn get_label_mapping(self) -> HashMap<i64, String> {
        // All variants expose the same optional `id2label` field, so extract it
        // first and apply a single `expect` rather than repeating it per arm.
        let id2label = match self {
            Self::Bert(config) => config.id2label,
            Self::DistilBert(config) => config.id2label,
            Self::Electra(config) => config.id2label,
        };
        id2label.expect("No label dictionary (id2label) provided in configuration file")
    }
}
impl TokenizerOption {
    /// Builds the tokenizer matching `model_type` from the given vocabulary files.
    ///
    /// Bert, DistilBert and Electra all share the BERT wordpiece tokenizer;
    /// Roberta additionally requires a merges file.
    ///
    /// # Panics
    ///
    /// Panics if `model_type` is `Roberta` and `merges_path` is `None`.
    pub fn from_file(
        model_type: ModelType,
        vocab_path: &str,
        merges_path: Option<&str>,
        lower_case: bool,
    ) -> Self {
        match model_type {
            ModelType::Roberta => {
                let merges = merges_path.expect("No merges specified!");
                Self::Roberta(RobertaTokenizer::from_file(vocab_path, merges, lower_case))
            }
            ModelType::Bert | ModelType::DistilBert | ModelType::Electra => {
                Self::Bert(BertTokenizer::from_file(vocab_path, lower_case))
            }
        }
    }

    /// Returns the model family of the wrapped tokenizer.
    ///
    /// NOTE(review): tokenizers built for DistilBert or Electra are stored as
    /// the `Bert` variant (see `from_file`), so this reports `ModelType::Bert`
    /// for them — the original `model_type` passed to `from_file` is not kept.
    pub fn model_type(&self) -> ModelType {
        match self {
            Self::Bert(_) => ModelType::Bert,
            Self::Roberta(_) => ModelType::Roberta,
        }
    }

    /// Tokenizes and encodes a batch of texts, truncating to `max_len` tokens
    /// per the given strategy, delegating to the wrapped tokenizer.
    pub fn encode_list(
        &self,
        text_list: Vec<&str>,
        max_len: usize,
        truncation_strategy: &TruncationStrategy,
        stride: usize,
    ) -> Vec<TokenizedInput> {
        match self {
            Self::Bert(tokenizer) => {
                tokenizer.encode_list(text_list, max_len, truncation_strategy, stride)
            }
            Self::Roberta(tokenizer) => {
                tokenizer.encode_list(text_list, max_len, truncation_strategy, stride)
            }
        }
    }
}