use crate::bert::BertConfig;
use crate::distilbert::DistilBertConfig;
use crate::electra::ElectraConfig;
use crate::Config;
use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::Tokenizer;
use rust_tokenizers::{BertTokenizer, RobertaTokenizer, TokenizedInput, TruncationStrategy};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
/// Supported model architectures.
#[derive(Clone, Copy, Serialize, Deserialize)]
pub enum ModelType {
    Bert,
    DistilBert,
    Roberta,
    Electra,
}
/// Wraps the configuration for any of the supported model architectures.
pub enum ConfigOption {
    /// BERT configuration (also used for RoBERTa)
    Bert(BertConfig),
    /// DistilBERT configuration
    DistilBert(DistilBertConfig),
    /// Electra configuration
    Electra(ElectraConfig),
}
/// Wraps the tokenizer for any of the supported model architectures.
pub enum TokenizerOption {
    /// BERT tokenizer (also used for DistilBERT and Electra)
    Bert(BertTokenizer),
    /// RoBERTa tokenizer
    Roberta(RobertaTokenizer),
}
impl ConfigOption {
    /// Loads the configuration for the given model type from a file at the provided path.
    pub fn from_file(model_type: ModelType, path: &Path) -> Self {
        match model_type {
            ModelType::Bert | ModelType::Roberta => ConfigOption::Bert(BertConfig::from_file(path)),
            ModelType::DistilBert => ConfigOption::DistilBert(DistilBertConfig::from_file(path)),
            ModelType::Electra => ConfigOption::Electra(ElectraConfig::from_file(path)),
        }
    }

    /// Returns the label mapping (`id2label`) defined in the configuration.
    ///
    /// # Panics
    /// Panics if the configuration file does not provide an `id2label` mapping.
    pub fn get_label_mapping(self) -> HashMap<i64, String> {
        match self {
            Self::Bert(config) => config
                .id2label
                .expect("No label dictionary (id2label) provided in configuration file"),
            Self::DistilBert(config) => config
                .id2label
                .expect("No label dictionary (id2label) provided in configuration file"),
            Self::Electra(config) => config
                .id2label
                .expect("No label dictionary (id2label) provided in configuration file"),
        }
    }
}
impl TokenizerOption {
    /// Builds the tokenizer matching the given model type from vocabulary (and merges) files.
    ///
    /// `merges_path` is only required for RoBERTa; the method panics if it is missing for that
    /// model type.
    pub fn from_file(model_type: ModelType, vocab_path: &str, merges_path: Option<&str>, lower_case: bool) -> Self {
        match model_type {
            ModelType::Bert | ModelType::DistilBert | ModelType::Electra => {
                TokenizerOption::Bert(BertTokenizer::from_file(vocab_path, lower_case))
            }
            ModelType::Roberta => TokenizerOption::Roberta(RobertaTokenizer::from_file(
                vocab_path,
                merges_path.expect("No merges specified!"),
                lower_case,
            )),
        }
    }

    /// Returns the model type associated with the wrapped tokenizer.
    pub fn model_type(&self) -> ModelType {
        match *self {
            Self::Bert(_) => ModelType::Bert,
            Self::Roberta(_) => ModelType::Roberta,
        }
    }

    /// Tokenizes and encodes a list of texts, delegating to the wrapped tokenizer.
    pub fn encode_list(&self, text_list: Vec<&str>, max_len: usize, truncation_strategy: &TruncationStrategy, stride: usize) -> Vec<TokenizedInput> {
        match *self {
            Self::Bert(ref tokenizer) => tokenizer.encode_list(text_list, max_len, truncation_strategy, stride),
            Self::Roberta(ref tokenizer) => tokenizer.encode_list(text_list, max_len, truncation_strategy, stride),
        }
    }
}
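
// Usage sketch (illustrative only): shows how the wrappers above fit together for a
// BERT-style model. The file paths are hypothetical placeholders and would need to point
// at a real configuration and vocabulary file downloaded for the chosen model.
//
//     let config = ConfigOption::from_file(ModelType::Bert, Path::new("path/to/config.json"));
//     let tokenizer = TokenizerOption::from_file(ModelType::Bert, "path/to/vocab.txt", None, true);
//     let inputs = tokenizer.encode_list(vec!["Hello world!"], 128, &TruncationStrategy::LongestFirst, 0);
//     let labels = config.get_label_mapping();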