1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
use crate::{NGram, NGramType, Tokenizer};
/// A feature group of one of six kinds; each variant wraps the struct that
/// carries the parameters for that kind.
///
/// Derives `buffalo::Read` and `buffalo::Write`.
/// NOTE(review): the `#[buffalo(id = N)]` values presumably identify each
/// variant in the encoded form, so existing ids should not be renumbered or
/// reused, and new variants should take a fresh id — confirm against buffalo.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "static", value_size = 8)]
pub enum FeatureGroup {
/// Parameters are held in an [`IdentityFeatureGroup`].
#[buffalo(id = 0)]
Identity(IdentityFeatureGroup),
/// Parameters are held in a [`NormalizedFeatureGroup`].
#[buffalo(id = 1)]
Normalized(NormalizedFeatureGroup),
/// Parameters are held in a [`OneHotEncodedFeatureGroup`].
#[buffalo(id = 2)]
OneHotEncoded(OneHotEncodedFeatureGroup),
/// Parameters are held in a [`BagOfWordsFeatureGroup`].
#[buffalo(id = 3)]
BagOfWords(BagOfWordsFeatureGroup),
/// Parameters are held in a [`WordEmbeddingFeatureGroup`].
#[buffalo(id = 4)]
WordEmbedding(WordEmbeddingFeatureGroup),
/// Parameters are held in a [`BagOfWordsCosineSimilarityFeatureGroup`].
#[buffalo(id = 5)]
BagOfWordsCosineSimilarity(BagOfWordsCosineSimilarityFeatureGroup),
}
/// Parameters of an identity feature group: only the name of the source column.
///
/// NOTE(review): presumably the column's values are used as the feature
/// unchanged — confirm against the code that computes features.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct IdentityFeatureGroup {
/// Name of the column this feature group reads from.
#[buffalo(id = 0, required)]
pub source_column_name: String,
}
/// Parameters of a normalized feature group: a source column name together
/// with a mean and a variance.
///
/// NOTE(review): presumably the feature is computed as
/// `(value - mean) / sqrt(variance)` using statistics gathered at training
/// time — confirm against the code that computes features.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct NormalizedFeatureGroup {
/// Name of the column this feature group reads from.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Mean stored for the source column.
#[buffalo(id = 1, required)]
pub mean: f32,
/// Variance stored for the source column (not the standard deviation, going
/// by the field name).
#[buffalo(id = 2, required)]
pub variance: f32,
}
/// Parameters of a one-hot encoded feature group: a source column name and a
/// list of variant strings.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct OneHotEncodedFeatureGroup {
/// Name of the column this feature group reads from.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// The variant strings.
/// NOTE(review): presumably the distinct values of the source column, with
/// list position selecting the one-hot slot — confirm before reordering.
#[buffalo(id = 1, required)]
pub variants: Vec<String>,
}
/// Parameters of a bag-of-words feature group over a single source column:
/// the tokenizer, the weighting strategy, the n-gram types, and the list of
/// n-grams with their per-n-gram entries.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsFeatureGroup {
/// Name of the column this feature group reads from.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Tokenizer configuration (type defined at the crate root).
#[buffalo(id = 1, required)]
pub tokenizer: Tokenizer,
/// How each n-gram's feature value is derived; see
/// [`BagOfWordsFeatureGroupStrategy`].
#[buffalo(id = 2, required)]
pub strategy: BagOfWordsFeatureGroupStrategy,
/// The n-gram types in use (type defined at the crate root).
#[buffalo(id = 3, required)]
pub ngram_types: Vec<NGramType>,
/// Each n-gram paired with its entry.
/// NOTE(review): presumably list position is the feature index for that
/// n-gram — confirm before reordering.
#[buffalo(id = 4, required)]
pub ngrams: Vec<(NGram, BagOfWordsFeatureGroupNGramEntry)>,
}
/// Parameters of a bag-of-words cosine similarity feature group over two
/// source columns, `a` and `b`, sharing one tokenizer, strategy, and n-gram
/// list.
///
/// NOTE(review): presumably the feature is the cosine similarity between the
/// bag-of-words vectors of the two columns — confirm against the code that
/// computes features.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsCosineSimilarityFeatureGroup {
/// Name of the first source column.
#[buffalo(id = 0, required)]
pub source_column_name_a: String,
/// Name of the second source column.
#[buffalo(id = 1, required)]
pub source_column_name_b: String,
/// Tokenizer configuration (type defined at the crate root).
#[buffalo(id = 2, required)]
pub tokenizer: Tokenizer,
/// How each n-gram's value is derived; see
/// [`BagOfWordsFeatureGroupStrategy`].
#[buffalo(id = 3, required)]
pub strategy: BagOfWordsFeatureGroupStrategy,
/// The n-gram types in use (type defined at the crate root).
#[buffalo(id = 4, required)]
pub ngram_types: Vec<NGramType>,
/// Each n-gram paired with its entry.
#[buffalo(id = 5, required)]
pub ngrams: Vec<(NGram, BagOfWordsFeatureGroupNGramEntry)>,
}
/// Strategy selector used by the bag-of-words feature groups. All variants are
/// unit variants (no payload), matching `value_size = 0`.
///
/// NOTE(review): the per-variant descriptions below are inferred from the
/// variant names — confirm against the code that computes features.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "static", value_size = 0)]
pub enum BagOfWordsFeatureGroupStrategy {
/// Presumably a binary indicator of whether the n-gram occurs.
#[buffalo(id = 0)]
Present,
/// Presumably the number of occurrences of the n-gram.
#[buffalo(id = 1)]
Count,
/// Presumably term frequency weighted by inverse document frequency.
#[buffalo(id = 2)]
TfIdf,
}
/// Per-n-gram data stored alongside each n-gram in the bag-of-words feature
/// groups.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsFeatureGroupNGramEntry {
/// The `idf` value for the n-gram.
/// NOTE(review): presumably the inverse document frequency used by the
/// `TfIdf` strategy — confirm where this value is computed.
#[buffalo(id = 0, required)]
pub idf: f32,
}
/// Parameters of a word embedding feature group: a source column name, a
/// tokenizer, and the embedding model.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct WordEmbeddingFeatureGroup {
/// Name of the column this feature group reads from.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Tokenizer configuration (type defined at the crate root).
#[buffalo(id = 1, required)]
pub tokenizer: Tokenizer,
/// The embedding model data; see [`WordEmbeddingModel`].
#[buffalo(id = 2, required)]
pub model: WordEmbeddingModel,
}
/// Data of a word embedding model: a size, a list of `(word, u64)` pairs, and
/// a flat list of `f32` values.
///
/// NOTE(review): presumably `values` is a row-major matrix with `size`
/// columns, and the `u64` paired with each word is that word's row index —
/// confirm against the code that reads this model.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct WordEmbeddingModel {
/// Presumably the dimensionality of each embedding vector — TODO confirm.
#[buffalo(id = 0, required)]
pub size: u64,
/// Words paired with a `u64` (presumably an index into `values` — TODO
/// confirm whether it is a row index or an element offset).
#[buffalo(id = 1, required)]
pub words: Vec<(String, u64)>,
/// The embedding values stored as a single flat vector.
#[buffalo(id = 2, required)]
pub values: Vec<f32>,
}