use crate::{NGram, NGramType, Tokenizer};
/// One of the six kinds of feature group, each variant wrapping the struct
/// that holds that kind's parameters.
///
/// Serialized through the `buffalo` derive macros with a static size and
/// `value_size = 8`. The per-variant `id`s (0 through 5) are what the derive
/// uses to tell variants apart.
// NOTE(review): renumbering or reusing an `id` presumably breaks reading of
// previously written data — confirm against the `buffalo` crate before editing.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "static", value_size = 8)]
pub enum FeatureGroup {
/// Payload is an [`IdentityFeatureGroup`].
#[buffalo(id = 0)]
Identity(IdentityFeatureGroup),
/// Payload is a [`NormalizedFeatureGroup`].
#[buffalo(id = 1)]
Normalized(NormalizedFeatureGroup),
/// Payload is a [`OneHotEncodedFeatureGroup`].
#[buffalo(id = 2)]
OneHotEncoded(OneHotEncodedFeatureGroup),
/// Payload is a [`BagOfWordsFeatureGroup`].
#[buffalo(id = 3)]
BagOfWords(BagOfWordsFeatureGroup),
/// Payload is a [`WordEmbeddingFeatureGroup`].
#[buffalo(id = 4)]
WordEmbedding(WordEmbeddingFeatureGroup),
/// Payload is a [`BagOfWordsCosineSimilarityFeatureGroup`].
#[buffalo(id = 5)]
BagOfWordsCosineSimilarity(BagOfWordsCosineSimilarityFeatureGroup),
}
/// Parameters of an identity feature group: only the name of the column it
/// is built from.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct whose single field is marked `required`.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct IdentityFeatureGroup {
/// Name of the source column.
// NOTE(review): presumably the column's values are used unchanged as the
// feature — confirm against the code that consumes this struct.
#[buffalo(id = 0, required)]
pub source_column_name: String,
}
/// Parameters of a normalized feature group: a source column name plus a
/// stored mean and variance.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct; all three fields are marked `required`.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct NormalizedFeatureGroup {
/// Name of the source column.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Stored mean.
// NOTE(review): presumably the mean of the source column observed at
// training time — confirm against the code that writes this struct.
#[buffalo(id = 1, required)]
pub mean: f32,
/// Stored variance (not standard deviation, going by the field name).
// NOTE(review): verify whether consumers take the square root before
// scaling — not visible from this file.
#[buffalo(id = 2, required)]
pub variance: f32,
}
/// Parameters of a one-hot encoded feature group: a source column name and
/// a list of variant strings.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct; both fields are marked `required`.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct OneHotEncodedFeatureGroup {
/// Name of the source column.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// The variant strings, in stored order.
// NOTE(review): presumably each entry corresponds to one output feature and
// the order fixes the feature index — confirm against the consumer.
#[buffalo(id = 1, required)]
pub variants: Vec<String>,
}
/// Parameters of a bag-of-words feature group built from a single source
/// column.
///
/// Holds the tokenizer, the weighting strategy, the n-gram types in use, and
/// the n-gram table. Serialized through the `buffalo` derive macros as a
/// dynamically sized struct; all five fields are marked `required`.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsFeatureGroup {
/// Name of the source column.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Tokenizer configuration (type defined elsewhere in this crate).
#[buffalo(id = 1, required)]
pub tokenizer: Tokenizer,
/// Which [`BagOfWordsFeatureGroupStrategy`] is used for this group.
#[buffalo(id = 2, required)]
pub strategy: BagOfWordsFeatureGroupStrategy,
/// The n-gram types stored for this group (type defined elsewhere in this
/// crate).
#[buffalo(id = 3, required)]
pub ngram_types: Vec<NGramType>,
/// Pairs of an n-gram and its [`BagOfWordsFeatureGroupNGramEntry`].
// NOTE(review): stored as a list of pairs rather than a map; presumably the
// position in the list is the feature index — confirm against the consumer.
#[buffalo(id = 4, required)]
pub ngrams: Vec<(NGram, BagOfWordsFeatureGroupNGramEntry)>,
}
/// Parameters of a bag-of-words cosine-similarity feature group, which
/// refers to two source columns (`a` and `b`).
///
/// Apart from the second column name, the fields mirror those of
/// `BagOfWordsFeatureGroup`: tokenizer, strategy, n-gram types, and n-gram
/// table. Serialized through the `buffalo` derive macros as a dynamically
/// sized struct; all six fields are marked `required`.
// NOTE(review): presumably the feature is the cosine similarity between the
// bag-of-words vectors of the two columns — the computation is not in this file.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsCosineSimilarityFeatureGroup {
/// Name of the first source column.
#[buffalo(id = 0, required)]
pub source_column_name_a: String,
/// Name of the second source column.
#[buffalo(id = 1, required)]
pub source_column_name_b: String,
/// Tokenizer configuration (type defined elsewhere in this crate).
// NOTE(review): a single tokenizer is stored, so presumably it is applied to
// both columns — confirm.
#[buffalo(id = 2, required)]
pub tokenizer: Tokenizer,
/// Which [`BagOfWordsFeatureGroupStrategy`] is used for this group.
#[buffalo(id = 3, required)]
pub strategy: BagOfWordsFeatureGroupStrategy,
/// The n-gram types stored for this group (type defined elsewhere in this
/// crate).
#[buffalo(id = 4, required)]
pub ngram_types: Vec<NGramType>,
/// Pairs of an n-gram and its [`BagOfWordsFeatureGroupNGramEntry`].
#[buffalo(id = 5, required)]
pub ngrams: Vec<(NGram, BagOfWordsFeatureGroupNGramEntry)>,
}
/// The strategy selected for a bag-of-words feature group.
///
/// All variants are unit variants; the enum is serialized through the
/// `buffalo` derive macros with a static size and `value_size = 0`, using the
/// per-variant `id`s 0 through 2.
// NOTE(review): the descriptions on the variants below are inferred from their
// names; the code that computes feature values is not in this file.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "static", value_size = 0)]
pub enum BagOfWordsFeatureGroupStrategy {
/// Presumably: the feature records whether the n-gram is present.
#[buffalo(id = 0)]
Present,
/// Presumably: the feature records how many times the n-gram occurs.
#[buffalo(id = 1)]
Count,
/// Presumably: the feature is a TF-IDF weight for the n-gram.
#[buffalo(id = 2)]
TfIdf,
}
/// Per-n-gram data stored alongside each n-gram in the bag-of-words feature
/// groups' `ngrams` lists.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct with a single `required` field.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct BagOfWordsFeatureGroupNGramEntry {
/// Stored `idf` value for the n-gram.
// NOTE(review): presumably the inverse document frequency used by the
// `TfIdf` strategy — confirm against the code that computes it.
#[buffalo(id = 0, required)]
pub idf: f32,
}
/// Parameters of a word-embedding feature group: a source column name, a
/// tokenizer, and the embedding model.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct; all three fields are marked `required`.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct WordEmbeddingFeatureGroup {
/// Name of the source column.
#[buffalo(id = 0, required)]
pub source_column_name: String,
/// Tokenizer configuration (type defined elsewhere in this crate).
#[buffalo(id = 1, required)]
pub tokenizer: Tokenizer,
/// The [`WordEmbeddingModel`] stored with this group.
#[buffalo(id = 2, required)]
pub model: WordEmbeddingModel,
}
/// A stored word-embedding model: a size, a list of `(word, u64)` pairs, and
/// a flat list of `f32` values.
///
/// Serialized through the `buffalo` derive macros as a dynamically sized
/// struct; all three fields are marked `required`.
// NOTE(review): the relationship between the three fields is not visible here.
// Presumably `values` is a row-major matrix with `size` columns and the `u64`
// paired with each word is that word's row index — confirm against the code
// that reads this struct.
#[derive(buffalo::Read, buffalo::Write)]
#[buffalo(size = "dynamic")]
pub struct WordEmbeddingModel {
/// Stored size of the model (presumably the embedding dimension — TODO
/// confirm).
#[buffalo(id = 0, required)]
pub size: u64,
/// Pairs of a word and an associated `u64`.
#[buffalo(id = 1, required)]
pub words: Vec<(String, u64)>,
/// The embedding values, stored as one flat list.
#[buffalo(id = 2, required)]
pub values: Vec<f32>,
}