#![allow(dead_code)]
#![allow(unused_imports)]
use crate::albert::config::AlbertConfig;
use crate::albert::model::AlbertModel;
use std::io::Read;
use trustformers_core::device::Device;
use trustformers_core::errors::Result;
use trustformers_core::layers::Linear;
use trustformers_core::tensor::Tensor;
use trustformers_core::traits::{Layer, Model, TokenizedInput};
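/// ALBERT with a linear classification head on top of the pooled `[CLS]`
/// output, for sentence-level tasks such as GLUE classification.
///
/// A minimal usage sketch (imports elided; assumes a `TokenizedInput` value
/// named `input` produced by a tokenizer outside this module):
///
/// ```ignore
/// let config = AlbertConfig::albert_base_v2();
/// let model = AlbertForSequenceClassification::new(config, 2)?;
/// let output = model.forward(input)?;
/// // Logits have shape [batch_size, num_labels].
/// assert_eq!(output.logits.shape()[1], 2);
/// ```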
#[derive(Debug, Clone)]
pub struct AlbertForSequenceClassification {
albert: AlbertModel,
classifier: Linear,
#[allow(dead_code)]
num_labels: usize,
device: Device,
}
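/// ALBERT with a per-token linear classification head, for tasks such as
/// named-entity recognition.
///
/// A minimal usage sketch (imports elided; `input` is a pre-built
/// `TokenizedInput`):
///
/// ```ignore
/// let config = AlbertConfig::albert_base_v2();
/// let model = AlbertForTokenClassification::new(config, 9)?; // e.g. 9 BIO tags
/// let output = model.forward(input)?;
/// // Logits have shape [batch_size, seq_len, num_labels].
/// ```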
#[derive(Debug, Clone)]
pub struct AlbertForTokenClassification {
albert: AlbertModel,
classifier: Linear,
#[allow(dead_code)]
num_labels: usize,
device: Device,
}
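/// ALBERT with a span-extraction head for extractive question answering
/// (SQuAD-style): one linear layer maps each token's hidden state to two
/// scores, interpreted as start- and end-position logits.
///
/// A minimal usage sketch (imports elided; `input` encodes a question/context
/// pair):
///
/// ```ignore
/// let config = AlbertConfig::albert_base_v2();
/// let model = AlbertForQuestionAnswering::new(config)?;
/// let output = model.forward(input)?;
/// // start_logits / end_logits: per-token span scores sliced from the last axis.
/// ```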
#[derive(Debug, Clone)]
pub struct AlbertForQuestionAnswering {
albert: AlbertModel,
qa_outputs: Linear,
device: Device,
}
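/// ALBERT with a masked-language-modeling head: hidden states are projected
/// back down to `embedding_size`, normalized, then decoded over the full
/// vocabulary.
///
/// A minimal usage sketch (imports elided; `input` contains `[MASK]` tokens):
///
/// ```ignore
/// let config = AlbertConfig::albert_base_v2();
/// let model = AlbertForMaskedLM::new(config)?;
/// let output = model.forward(input)?;
/// // Logits have shape [batch_size, seq_len, vocab_size].
/// ```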
#[derive(Debug, Clone)]
pub struct AlbertForMaskedLM {
albert: AlbertModel,
predictions: AlbertMLMHead,
device: Device,
}
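/// MLM prediction head: a dense projection back to `embedding_size`, the
/// configured activation, LayerNorm, and a decoder to vocabulary size with a
/// separate output bias.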
#[derive(Debug, Clone)]
pub struct AlbertMLMHead {
dense: Linear,
layer_norm: trustformers_core::layers::LayerNorm,
decoder: Linear,
bias: Tensor,
hidden_act: String,
device: Device,
}
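/// Output of [`AlbertForSequenceClassification`]: classification logits plus
/// the encoder's final hidden state and optional attention maps.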
#[derive(Debug)]
pub struct AlbertSequenceClassifierOutput {
pub logits: Tensor,
pub hidden_states: Option<Tensor>,
pub attentions: Option<Vec<Tensor>>,
}
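/// Output of [`AlbertForTokenClassification`]: per-token logits plus the
/// encoder's final hidden state and optional attention maps.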
#[derive(Debug)]
pub struct AlbertTokenClassifierOutput {
pub logits: Tensor,
pub hidden_states: Option<Tensor>,
pub attentions: Option<Vec<Tensor>>,
}
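/// Output of [`AlbertForQuestionAnswering`]: per-token start/end span logits
/// plus the encoder's final hidden state and optional attention maps.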
#[derive(Debug)]
pub struct AlbertForQuestionAnsweringOutput {
pub start_logits: Tensor,
pub end_logits: Tensor,
pub hidden_states: Option<Tensor>,
pub attentions: Option<Vec<Tensor>>,
}
#[derive(Debug)]
pub struct AlbertMaskedLMOutput {
pub logits: Tensor,
pub hidden_states: Option<Tensor>,
pub attentions: Option<Vec<Tensor>>,
}
impl AlbertForSequenceClassification {
pub fn new(config: AlbertConfig, num_labels: usize) -> Result<Self> {
Self::new_with_device(config, num_labels, Device::CPU)
}
pub fn new_with_device(
config: AlbertConfig,
num_labels: usize,
device: Device,
) -> Result<Self> {
let albert = AlbertModel::new_with_device(config.clone(), device)?;
// The classifier dropout probability is resolved from the config for parity
// with the reference implementation, but dropout is not applied in this
// forward pass.
let _classifier_dropout =
config.classifier_dropout_prob.unwrap_or(config.hidden_dropout_prob);
let classifier = Linear::new_with_device(config.hidden_size, num_labels, true, device);
Ok(Self {
albert,
classifier,
num_labels,
device,
})
}
pub fn device(&self) -> Device {
self.device
}
}
impl Model for AlbertForSequenceClassification {
type Config = AlbertConfig;
type Input = TokenizedInput;
type Output = AlbertSequenceClassifierOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let albert_output = self.albert.forward(input)?;
let pooled_output = albert_output.pooler_output.ok_or_else(|| {
trustformers_core::errors::TrustformersError::model_error(
"Pooler output is required for sequence classification".to_string(),
)
})?;
let logits = self.classifier.forward(pooled_output)?;
Ok(AlbertSequenceClassifierOutput {
logits,
hidden_states: Some(albert_output.last_hidden_state),
attentions: albert_output.attentions,
})
}
fn load_pretrained(&mut self, reader: &mut dyn Read) -> Result<()> {
self.albert.load_pretrained(reader)
}
fn get_config(&self) -> &Self::Config {
self.albert.get_config()
}
fn num_parameters(&self) -> usize {
let config = self.albert.get_config();
// Token, position, and token-type embedding tables, plus the embedding
// LayerNorm weight and bias, all at `embedding_size` width.
let embedding_params = config.vocab_size * config.embedding_size
+ config.max_position_embeddings * config.embedding_size
+ config.type_vocab_size * config.embedding_size
+ config.embedding_size * 2;
// Factorized embedding projection from embedding_size up to hidden_size
// (weight + bias).
let projection_params = config.embedding_size * config.hidden_size + config.hidden_size;
// Q/K/V/output projections (weight + bias each).
let attention_params_per_layer =
4 * (config.hidden_size * config.hidden_size + config.hidden_size);
// Feed-forward up- and down-projections (weight + bias each).
let ffn_params_per_layer = config.hidden_size * config.intermediate_size
+ config.intermediate_size
+ config.intermediate_size * config.hidden_size
+ config.hidden_size;
// Two LayerNorms per layer (weight + bias each).
let layer_norm_params = 4 * config.hidden_size;
let params_per_layer =
attention_params_per_layer + ffn_params_per_layer + layer_norm_params;
// ALBERT shares parameters across layers: only num_hidden_groups *
// inner_group_num distinct layer parameter sets exist, regardless of
// num_hidden_layers.
let encoder_params = config.num_hidden_groups * config.inner_group_num * params_per_layer;
let pooler_params = config.hidden_size * config.hidden_size + config.hidden_size;
let classifier_params = config.hidden_size * self.num_labels + self.num_labels;
embedding_params + projection_params + encoder_params + pooler_params + classifier_params
}
}
impl AlbertForTokenClassification {
pub fn new(config: AlbertConfig, num_labels: usize) -> Result<Self> {
Self::new_with_device(config, num_labels, Device::CPU)
}
pub fn new_with_device(
config: AlbertConfig,
num_labels: usize,
device: Device,
) -> Result<Self> {
let albert = AlbertModel::new_with_device(config.clone(), device)?;
let classifier = Linear::new_with_device(config.hidden_size, num_labels, true, device);
Ok(Self {
albert,
classifier,
num_labels,
device,
})
}
pub fn device(&self) -> Device {
self.device
}
}
impl Model for AlbertForTokenClassification {
type Config = AlbertConfig;
type Input = TokenizedInput;
type Output = AlbertTokenClassifierOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let albert_output = self.albert.forward(input)?;
let logits = self.classifier.forward(albert_output.last_hidden_state.clone())?;
Ok(AlbertTokenClassifierOutput {
logits,
hidden_states: Some(albert_output.last_hidden_state),
attentions: albert_output.attentions,
})
}
fn load_pretrained(&mut self, reader: &mut dyn Read) -> Result<()> {
self.albert.load_pretrained(reader)
}
fn get_config(&self) -> &Self::Config {
self.albert.get_config()
}
fn num_parameters(&self) -> usize {
let config = self.albert.get_config();
let embedding_params = config.vocab_size * config.embedding_size
+ config.max_position_embeddings * config.embedding_size
+ config.type_vocab_size * config.embedding_size
+ config.embedding_size * 2;
let projection_params = config.embedding_size * config.hidden_size + config.hidden_size;
let params_per_layer = 4 * (config.hidden_size * config.hidden_size + config.hidden_size)
+ config.hidden_size * config.intermediate_size
+ config.intermediate_size
+ config.intermediate_size * config.hidden_size
+ config.hidden_size
+ 4 * config.hidden_size;
let encoder_params = config.num_hidden_groups * config.inner_group_num * params_per_layer;
let pooler_params = config.hidden_size * config.hidden_size + config.hidden_size;
let classifier_params = config.hidden_size * self.num_labels + self.num_labels;
embedding_params + projection_params + encoder_params + pooler_params + classifier_params
}
}
impl AlbertForQuestionAnswering {
pub fn new(config: AlbertConfig) -> Result<Self> {
Self::new_with_device(config, Device::CPU)
}
pub fn new_with_device(config: AlbertConfig, device: Device) -> Result<Self> {
let albert = AlbertModel::new_with_device(config.clone(), device)?;
let qa_outputs = Linear::new_with_device(config.hidden_size, 2, true, device);
Ok(Self {
albert,
qa_outputs,
device,
})
}
pub fn device(&self) -> Device {
self.device
}
}
impl Model for AlbertForQuestionAnswering {
type Config = AlbertConfig;
type Input = TokenizedInput;
type Output = AlbertForQuestionAnsweringOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let albert_output = self.albert.forward(input)?;
let logits = self.qa_outputs.forward(albert_output.last_hidden_state.clone())?;
// Logits have shape [batch, seq_len, 2]; split the last axis into start-
// and end-position logits.
let start_logits = logits.slice(2, 0, 1)?;
let end_logits = logits.slice(2, 1, 2)?;
Ok(AlbertForQuestionAnsweringOutput {
start_logits,
end_logits,
hidden_states: Some(albert_output.last_hidden_state),
attentions: albert_output.attentions,
})
}
fn load_pretrained(&mut self, reader: &mut dyn Read) -> Result<()> {
self.albert.load_pretrained(reader)
}
fn get_config(&self) -> &Self::Config {
self.albert.get_config()
}
fn num_parameters(&self) -> usize {
let config = self.albert.get_config();
let embedding_params = config.vocab_size * config.embedding_size
+ config.max_position_embeddings * config.embedding_size
+ config.type_vocab_size * config.embedding_size
+ config.embedding_size * 2;
let projection_params = config.embedding_size * config.hidden_size + config.hidden_size;
let params_per_layer = 4 * (config.hidden_size * config.hidden_size + config.hidden_size)
+ config.hidden_size * config.intermediate_size
+ config.intermediate_size
+ config.intermediate_size * config.hidden_size
+ config.hidden_size
+ 4 * config.hidden_size;
let encoder_params = config.num_hidden_groups * config.inner_group_num * params_per_layer;
let pooler_params = config.hidden_size * config.hidden_size + config.hidden_size;
let qa_params = config.hidden_size * 2 + 2;
embedding_params + projection_params + encoder_params + pooler_params + qa_params
}
}
impl AlbertMLMHead {
fn new(config: &AlbertConfig) -> Result<Self> {
Self::new_with_device(config, Device::CPU)
}
fn new_with_device(config: &AlbertConfig, device: Device) -> Result<Self> {
let dense =
Linear::new_with_device(config.hidden_size, config.embedding_size, true, device);
let layer_norm = trustformers_core::layers::LayerNorm::new_with_device(
vec![config.embedding_size],
config.layer_norm_eps,
device,
)?;
let decoder =
Linear::new_with_device(config.embedding_size, config.vocab_size, false, device);
let bias = Tensor::zeros(&[config.vocab_size])?;
Ok(Self {
dense,
layer_norm,
decoder,
bias,
hidden_act: config.hidden_act.clone(),
device,
})
}
fn device(&self) -> Device {
self.device
}
fn forward(&self, hidden_states: Tensor) -> Result<Tensor> {
let hidden_states = self.dense.forward(hidden_states)?;
let hidden_states = match "gelu" {
"gelu" => trustformers_core::ops::activations::gelu(&hidden_states)?,
"relu" => trustformers_core::ops::activations::relu(&hidden_states)?,
_ => hidden_states,
};
let hidden_states = self.layer_norm.forward(hidden_states)?;
let hidden_states = self.decoder.forward(hidden_states)?;
let hidden_states = hidden_states.add(&self.bias)?;
Ok(hidden_states)
}
}
impl AlbertForMaskedLM {
pub fn new(config: AlbertConfig) -> Result<Self> {
Self::new_with_device(config, Device::CPU)
}
pub fn new_with_device(config: AlbertConfig, device: Device) -> Result<Self> {
let albert = AlbertModel::new_with_device(config.clone(), device)?;
let predictions = AlbertMLMHead::new_with_device(&config, device)?;
Ok(Self {
albert,
predictions,
device,
})
}
pub fn device(&self) -> Device {
self.device
}
}
impl Model for AlbertForMaskedLM {
type Config = AlbertConfig;
type Input = TokenizedInput;
type Output = AlbertMaskedLMOutput;
fn forward(&self, input: Self::Input) -> Result<Self::Output> {
let albert_output = self.albert.forward(input)?;
let logits = self.predictions.forward(albert_output.last_hidden_state.clone())?;
Ok(AlbertMaskedLMOutput {
logits,
hidden_states: Some(albert_output.last_hidden_state),
attentions: albert_output.attentions,
})
}
fn load_pretrained(&mut self, reader: &mut dyn Read) -> Result<()> {
self.albert.load_pretrained(reader)
}
fn get_config(&self) -> &Self::Config {
self.albert.get_config()
}
fn num_parameters(&self) -> usize {
let config = self.albert.get_config();
let embedding_params = config.vocab_size * config.embedding_size
+ config.max_position_embeddings * config.embedding_size
+ config.type_vocab_size * config.embedding_size
+ config.embedding_size * 2;
let projection_params = config.embedding_size * config.hidden_size + config.hidden_size;
let params_per_layer = 4 * (config.hidden_size * config.hidden_size + config.hidden_size)
+ config.hidden_size * config.intermediate_size
+ config.intermediate_size
+ config.intermediate_size * config.hidden_size
+ config.hidden_size
+ 4 * config.hidden_size;
let encoder_params = config.num_hidden_groups * config.inner_group_num * params_per_layer;
let pooler_params = config.hidden_size * config.hidden_size + config.hidden_size;
// MLM head: dense (weight + bias), LayerNorm (weight + bias), decoder
// weight, and decoder bias.
let mlm_head_params = config.hidden_size * config.embedding_size
+ config.embedding_size
+ config.embedding_size * 2
+ config.embedding_size * config.vocab_size
+ config.vocab_size;
embedding_params + projection_params + encoder_params + pooler_params + mlm_head_params
}
}
#[cfg(test)]
mod tests {
use crate::albert::config::AlbertConfig;
use trustformers_core::traits::Config;
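// Deterministic linear congruential generator (Knuth's MMIX constants),
// used so tests can draw reproducible pseudo-random values without pulling
// in an external RNG crate.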
struct Lcg {
state: u64,
}
impl Lcg {
fn new(seed: u64) -> Self {
Lcg { state: seed }
}
fn next(&mut self) -> u64 {
self.state = self
.state
.wrapping_mul(6364136223846793005u64)
.wrapping_add(1442695040888963407u64);
self.state
}
fn next_f32(&mut self) -> f32 {
(self.next() >> 11) as f32 / (1u64 << 53) as f32
}
}
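// A deliberately tiny ALBERT config so construction-oriented tests stay fast.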
fn make_small_config() -> AlbertConfig {
AlbertConfig {
vocab_size: 1000,
embedding_size: 64,
hidden_size: 128,
num_hidden_layers: 2,
num_hidden_groups: 1,
num_attention_heads: 4,
intermediate_size: 256,
inner_group_num: 1,
hidden_act: "gelu".to_string(),
hidden_dropout_prob: 0.0,
attention_probs_dropout_prob: 0.0,
max_position_embeddings: 64,
type_vocab_size: 2,
initializer_range: 0.02,
layer_norm_eps: 1e-12,
classifier_dropout_prob: None,
position_embedding_type: "absolute".to_string(),
pad_token_id: 0,
bos_token_id: 2,
eos_token_id: 3,
}
}
#[test]
fn test_albert_config_default_validates() {
let cfg = AlbertConfig::default();
assert!(cfg.validate().is_ok());
}
#[test]
fn test_albert_config_base_v1() {
let cfg = AlbertConfig::albert_base_v1();
assert_eq!(cfg.vocab_size, 30000);
assert_eq!(cfg.hidden_size, 768);
assert_eq!(cfg.num_attention_heads, 12);
assert!(cfg.validate().is_ok());
}
#[test]
fn test_albert_config_base_v2() {
let cfg = AlbertConfig::albert_base_v2();
assert_eq!(cfg.hidden_size, 768);
assert_eq!(cfg.embedding_size, 128);
assert!(cfg.validate().is_ok());
}
#[test]
fn test_albert_config_large() {
let cfg = AlbertConfig::albert_large_v2();
assert_eq!(cfg.hidden_size, 1024);
assert_eq!(cfg.num_hidden_layers, 24);
assert!(cfg.validate().is_ok());
}
#[test]
fn test_albert_config_xlarge() {
let cfg = AlbertConfig::albert_xlarge_v2();
assert_eq!(cfg.hidden_size, 2048);
assert!(cfg.validate().is_ok());
}
#[test]
fn test_albert_config_xxlarge() {
let cfg = AlbertConfig::albert_xxlarge_v2();
assert_eq!(cfg.hidden_size, 4096);
assert!(cfg.validate().is_ok());
}
#[test]
fn test_architecture_name() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.architecture(), "albert");
}
#[test]
fn test_small_config_hidden_divisible_by_heads() {
let cfg = make_small_config();
assert_eq!(cfg.hidden_size % cfg.num_attention_heads, 0);
}
#[test]
fn test_small_config_validate_passes() {
let cfg = make_small_config();
assert!(cfg.validate().is_ok());
}
#[test]
fn test_hidden_not_divisible_fails() {
let cfg = AlbertConfig {
hidden_size: 100,
num_attention_heads: 12,
..AlbertConfig::default()
};
assert!(cfg.validate().is_err());
}
#[test]
fn test_embedding_size_field_exists() {
let cfg = AlbertConfig::default();
assert!(cfg.embedding_size > 0);
assert!(cfg.embedding_size <= cfg.hidden_size);
}
#[test]
fn test_inner_group_num_field() {
let cfg = AlbertConfig::default();
assert!(cfg.inner_group_num >= 1);
}
#[test]
fn test_num_hidden_groups_default() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.num_hidden_groups, 1);
}
#[test]
fn test_classifier_dropout_default_none() {
let cfg = AlbertConfig::default();
assert!(cfg.classifier_dropout_prob.is_none());
}
#[test]
fn test_from_pretrained_name_base_v2() {
let cfg = AlbertConfig::from_pretrained_name("albert-base-v2");
assert_eq!(cfg.hidden_size, 768);
}
#[test]
fn test_from_pretrained_name_large_v2() {
let cfg = AlbertConfig::from_pretrained_name("albert-large-v2");
assert_eq!(cfg.hidden_size, 1024);
}
#[test]
fn test_num_parameters_increases_with_size() {
let base = AlbertConfig::albert_base_v2();
let large = AlbertConfig::albert_large_v2();
assert!(large.hidden_size > base.hidden_size);
}
#[test]
fn test_lcg_produces_range() {
let mut rng = Lcg::new(31415);
for _ in 0..100 {
let v = rng.next_f32();
assert!((0.0..1.0).contains(&v));
}
}
#[test]
fn test_position_embedding_type_default() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.position_embedding_type, "absolute");
}
#[test]
fn test_max_position_embeddings_default() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.max_position_embeddings, 512);
}
#[test]
fn test_type_vocab_size_default() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.type_vocab_size, 2);
}
#[test]
fn test_pad_token_id_default() {
let cfg = AlbertConfig::default();
assert_eq!(cfg.pad_token_id, 0);
}
}