ct2rs 0.9.18

Rust bindings for OpenNMT/CTranslate2
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
"""Declares specification of the Transformer model."""

from typing import Optional, Tuple, Union

import numpy as np

from ctranslate2.specs import attention_spec, common_spec, model_spec


class TransformerEncoderSpec(model_spec.LayerSpec):
    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        pre_norm: bool = True,
        no_final_norm: bool = False,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        num_source_embeddings: int = 1,
        embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
        layernorm_embedding: bool = False,
        relative_position: bool = False,
        relative_attention_bias: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        multi_query_attention: bool = False,
        num_heads_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        sliding_window: Optional[int] = None,
        qk_norm: bool = False,
        pre_post_layer_norm: bool = False,
    ):
        """Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          sliding_window: Max sequence length to retain in KV Cache.
          qk_norm: Apply layer normalization to the query and key projections.
          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        """

        # multi_query_attention is a legacy alias for num_heads_kv=1: reject a
        # conflicting explicit value, then normalize.
        if multi_query_attention:
            if num_heads_kv is not None and num_heads_kv != 1:
                raise ValueError(
                    "Enabling multi_query_attention implies num_heads_kv=1"
                )
            num_heads_kv = 1

        self.multi_query_attention = multi_query_attention
        # Scalar attributes are stored as fixed-width NumPy scalars (presumably
        # so they serialize with a known byte width — confirm against the
        # model_spec serializer).
        self.num_heads = np.dtype("int16").type(num_heads)
        self.pre_norm = pre_norm
        self.activation = np.dtype("int8").type(activation)
        self.embeddings_merge = np.dtype("int8").type(embeddings_merge)
        # One embedding table per source input stream.
        self.embeddings = [
            common_spec.EmbeddingsSpec() for _ in range(num_source_embeddings)
        ]
        self.scale_embeddings = True
        # Absolute position encodings are used only when no relative position
        # scheme is enabled.
        if not relative_position and not relative_attention_bias:
            self.position_encodings = PositionEncoderSpec()
        # The final layer norm only exists in the pre-norm architecture and
        # can be disabled explicitly.
        if pre_norm and not no_final_norm:
            self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
        if layernorm_embedding:
            self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
        # NOTE(review): sliding_window is stored on the encoder spec but not
        # forwarded to the layers below, unlike in TransformerDecoderSpec —
        # confirm this asymmetry is intended.
        if sliding_window is not None:
            self.sliding_window = np.dtype("int32").type(sliding_window)

        self.layer = [
            TransformerEncoderLayerSpec(
                relative_position=relative_position,
                relative_attention_bias=relative_attention_bias,
                ffn_glu=ffn_glu,
                rms_norm=rms_norm,
                num_heads_kv=num_heads_kv,
                head_dim=head_dim,
                rotary_dim=rotary_dim,
                rotary_interleave=rotary_interleave,
                rotary_scaling_type=rotary_scaling_type,
                rotary_scaling_factor=rotary_scaling_factor,
                rotary_base=rotary_base,
                qk_norm=qk_norm,
                pre_post_layer_norm=pre_post_layer_norm,
            )
            for _ in range(num_layers)
        ]


class TransformerDecoderSpec(model_spec.LayerSpec):
    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        pre_norm: bool = True,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        layernorm_embedding: bool = False,
        with_encoder_attention: bool = True,
        no_final_norm: bool = False,
        project_in_out: bool = False,
        relative_position: bool = False,
        relative_attention_bias: bool = False,
        alignment_layer: int = -1,
        alignment_heads: int = 1,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        alibi: bool = False,
        alibi_use_positive_positions: bool = False,
        scale_alibi: bool = False,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        original_max_position_embeddings: int = 0,
        max_position_embeddings: int = 0,
        parallel_residual: bool = False,
        shared_layer_norm: bool = False,
        pre_post_layer_norm: bool = False,
        multi_query_attention: bool = False,
        num_heads_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        sliding_window: Optional[int] = None,
        quant_type: Optional[common_spec.Quantization] = None,
        quant_group_size: Optional[int] = None,
        quant_bits: Optional[int] = None,
        qk_norm: bool = False,
        external_pre_post_encoder_layers: bool = False,
    ):
        """Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          qk_norm: Apply layer normalization to the query and key projections.
          external_pre_post_encoder_layers: if the encoder attention pre and processing
            is done outside the attention.
        """

        self._config = dict()
        # Parallel residual is a decoder-only (GPT-J style) block layout.
        if parallel_residual:
            if not pre_norm:
                raise ValueError("The GPT-J block expects a pre-norm architecture")
            if with_encoder_attention:
                raise ValueError("The GPT-J block does not have cross attention")

        # multi_query_attention is a legacy alias for num_heads_kv=1: reject a
        # conflicting explicit value, then normalize.
        if multi_query_attention:
            if num_heads_kv is not None and num_heads_kv != 1:
                raise ValueError(
                    "Enabling multi_query_attention implies num_heads_kv=1"
                )
            num_heads_kv = 1

        # Scalar attributes are stored as fixed-width NumPy scalars (presumably
        # so they serialize with a known byte width — confirm against the
        # model_spec serializer).
        self.num_heads = np.dtype("int16").type(num_heads)
        self.pre_norm = pre_norm
        self.activation = np.dtype("int8").type(activation)
        self.alignment_layer = np.dtype("int16").type(alignment_layer)
        self.alignment_heads = np.dtype("int16").type(alignment_heads)
        self.embeddings = common_spec.EmbeddingsSpec()
        self.scale_embeddings = True
        self.scale_outputs = model_spec.OPTIONAL
        self.alibi = alibi
        self.alibi_use_positive_positions = alibi_use_positive_positions
        self.scale_alibi = scale_alibi
        if sliding_window is not None:
            self.sliding_window = np.dtype("int32").type(sliding_window)
        # Absolute position encodings are used only when no other position
        # scheme (relative, ALiBi, rotary) is enabled.
        if (
            not relative_position
            and not relative_attention_bias
            and not alibi
            and rotary_dim is None
        ):
            self.position_encodings = PositionEncoderSpec()
        # The final layer norm only exists in the pre-norm architecture and
        # can be disabled explicitly.
        if pre_norm and not no_final_norm:
            self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
        if layernorm_embedding:
            self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
        # Output projection to the target vocabulary.
        self.projection = common_spec.LinearSpec()
        self.layer = [
            TransformerDecoderLayerSpec(
                with_encoder_attention=with_encoder_attention,
                relative_position=relative_position,
                relative_attention_bias=relative_attention_bias,
                ffn_glu=ffn_glu,
                rms_norm=rms_norm,
                rotary_dim=rotary_dim,
                rotary_interleave=rotary_interleave,
                rotary_scaling_type=rotary_scaling_type,
                rotary_scaling_factor=rotary_scaling_factor,
                rotary_base=rotary_base,
                original_max_position_embeddings=original_max_position_embeddings,
                max_position_embeddings=max_position_embeddings,
                parallel_residual=parallel_residual,
                shared_layer_norm=shared_layer_norm,
                pre_post_layer_norm=pre_post_layer_norm,
                num_heads_kv=num_heads_kv,
                head_dim=head_dim,
                sliding_window=sliding_window,
                qk_norm=qk_norm,
                external_pre_post_encoder_layers=external_pre_post_encoder_layers,
            )
            for _ in range(num_layers)
        ]
        self.start_from_zero_embedding = False
        # num_heads_kv=None means "same as num_heads", so only a non-None value
        # that differs from num_heads marks grouped/multi-query attention in the
        # config. (The previous expression compared None != num_heads, which is
        # always true, so the flag was wrongly set for the default arguments.)
        self._config["multi_query_attention"] = multi_query_attention or (
            num_heads_kv is not None and num_heads_kv != num_heads
        )

        if project_in_out:
            self.project_in = common_spec.LinearSpec()
            self.project_out = common_spec.LinearSpec()

        # Lower-bit quantization settings are exposed through the model config.
        if quant_type:
            self._config["quantization_type"] = quant_type
            self._config["quantization_bits"] = quant_bits
            self._config["quantization_group_size"] = quant_group_size

    @property
    def config(self):
        """Decoder-level configuration to merge into the model config."""
        return self._config


class TransformerEncoderLayerSpec(model_spec.LayerSpec):
    """Specification of a single encoder layer: self-attention + feed-forward."""

    def __init__(
        self,
        relative_position=False,
        relative_attention_bias=False,
        ffn_glu=False,
        rms_norm=False,
        num_heads_kv=None,
        head_dim=None,
        sliding_window=None,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        qk_norm=False,
        pre_post_layer_norm: bool = False,
    ):
        self.self_attention = attention_spec.MultiHeadAttentionSpec(
            self_attention=True,
            relative_position=relative_position,
            relative_attention_bias=relative_attention_bias,
            rms_norm=rms_norm,
            num_heads_kv=num_heads_kv,
            head_dim=head_dim,
            sliding_window=sliding_window,
            rotary_dim=rotary_dim,
            rotary_interleave=rotary_interleave,
            rotary_scaling_type=rotary_scaling_type,
            rotary_scaling_factor=rotary_scaling_factor,
            rotary_base=rotary_base,
            qk_norm=qk_norm,
        )
        self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)

        if not pre_post_layer_norm:
            return

        # In the pre/post layer norm layout, normalization is owned by the
        # layer itself, so the sublayers lose their individual layer norms.
        for norm_name in (
            "input_layer_norm",
            "post_attention_layer_norm",
            "pre_feedforward_layer_norm",
            "post_feedforward_layer_norm",
        ):
            setattr(self, norm_name, common_spec.LayerNormSpec(rms_norm=rms_norm))

        delattr(self.self_attention, "layer_norm")
        delattr(self.ffn, "layer_norm")


class TransformerDecoderLayerSpec(model_spec.LayerSpec):
    """Specification of a single decoder layer.

    A layer always contains self-attention and a feed-forward block, plus
    cross-attention on the encoder output when ``with_encoder_attention`` is
    set. The ``parallel_residual`` and ``pre_post_layer_norm`` modes move the
    layer norms from the sublayers onto the layer itself.
    """

    def __init__(
        self,
        with_encoder_attention=True,
        relative_position=False,
        relative_attention_bias=False,
        ffn_glu=False,
        rms_norm=False,
        rotary_dim=None,
        rotary_interleave=True,
        rotary_scaling_type=None,
        rotary_scaling_factor=1,
        rotary_base=10000,
        original_max_position_embeddings=0,
        max_position_embeddings=0,
        parallel_residual=False,
        shared_layer_norm=False,
        pre_post_layer_norm=False,
        num_heads_kv=None,
        head_dim=None,
        sliding_window=None,
        qk_norm=False,
        external_pre_post_encoder_layers=False,
    ):
        self.self_attention = attention_spec.MultiHeadAttentionSpec(
            self_attention=True,
            relative_position=relative_position,
            relative_attention_bias=relative_attention_bias,
            rms_norm=rms_norm,
            rotary_dim=rotary_dim,
            rotary_interleave=rotary_interleave,
            rotary_scaling_type=rotary_scaling_type,
            rotary_scaling_factor=rotary_scaling_factor,
            rotary_base=rotary_base,
            original_max_position_embeddings=original_max_position_embeddings,
            max_position_embeddings=max_position_embeddings,
            num_heads_kv=num_heads_kv,
            head_dim=head_dim,
            sliding_window=sliding_window,
            qk_norm=qk_norm,
        )

        # Cross-attention over the encoder output. It keeps its own layer norm
        # unless the pre/post encoder-attention norms are handled externally
        # (see the pre_post_layer_norm branch below).
        if with_encoder_attention:
            self.attention = attention_spec.MultiHeadAttentionSpec(
                rms_norm=rms_norm,
                num_heads_kv=num_heads_kv,
                head_dim=head_dim,
                sliding_window=sliding_window,
                qk_norm=qk_norm,
                has_norm=external_pre_post_encoder_layers is False,
            )

        self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)

        # GPT-J/GPT-NeoX style block: the layer norms live on the layer (one
        # shared, or separate input/post-attention), so the sublayers' own
        # norms are removed.
        if parallel_residual:
            if shared_layer_norm:
                self.shared_layer_norm = common_spec.LayerNormSpec()
            else:
                self.input_layer_norm = common_spec.LayerNormSpec()
                self.post_attention_layer_norm = common_spec.LayerNormSpec()

            delattr(self.self_attention, "layer_norm")
            delattr(self.ffn, "layer_norm")

        # NOTE(review): if parallel_residual and pre_post_layer_norm were both
        # set, "layer_norm" would be deleted twice and the second delattr would
        # raise AttributeError — presumably callers enable at most one; confirm.
        if pre_post_layer_norm:
            # Self-attention layer norms
            self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
            self.post_attention_layer_norm = common_spec.LayerNormSpec(
                rms_norm=rms_norm
            )

            if with_encoder_attention and external_pre_post_encoder_layers:
                self.external_post_encoder_attention_layer_norm = (
                    common_spec.LayerNormSpec(rms_norm=rms_norm)
                )
                self.external_pre_encoder_attention_layer_norm = (
                    common_spec.LayerNormSpec(rms_norm=rms_norm)
                )

            # Feed-forward layer norms
            self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
                rms_norm=rms_norm
            )
            self.post_feedforward_layer_norm = common_spec.LayerNormSpec(
                rms_norm=rms_norm
            )

            delattr(self.self_attention, "layer_norm")
            delattr(self.ffn, "layer_norm")


class FeedForwardSpec(model_spec.LayerSpec):
    """Feed-forward sublayer: a layer norm followed by two linear projections."""

    def __init__(self, glu=False, rms_norm=False):
        linear = common_spec.LinearSpec
        self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
        self.linear_0 = linear()
        self.linear_1 = linear()
        if glu:
            # Gated variant: a parallel projection applied without activation.
            self.linear_0_noact = linear()


class PositionEncoderSpec(model_spec.LayerSpec):
    """Holds the position encoding table, which may be absent from the model."""

    def __init__(self):
        # Marked optional: the encodings variable does not have to be set.
        self.encodings = model_spec.OPTIONAL


class TransformerConfig(model_spec.SequenceToSequenceModelConfig):
    """Configuration for Transformer models."""

    def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
        """Initializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        """
        # Merge the explicit option into the remaining keyword arguments and
        # let the base class handle all of them.
        options = dict(kwargs, layer_norm_epsilon=layer_norm_epsilon)
        super().__init__(**options)


class TransformerSpec(model_spec.SequenceToSequenceModelSpec):
    """Describes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    """

    def __init__(
        self, encoder: TransformerEncoderSpec, decoder: TransformerDecoderSpec
    ):
        """Initializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        """
        # Validate early so converter errors carry a clear message.
        if not isinstance(encoder, TransformerEncoderSpec):
            raise TypeError("encoder argument must be a TransformerEncoderSpec")
        if not isinstance(decoder, TransformerDecoderSpec):
            raise TypeError("decoder argument must be a TransformerDecoderSpec")

        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self._config.add_attribute(
            "multi_query_attention", self.encoder.multi_query_attention
        )

    @classmethod
    def from_config(
        cls,
        num_layers: Union[int, Tuple[int, int]],
        num_heads: int,
        with_relative_position: bool = False,
        pre_norm: bool = True,
        no_final_norm: bool = False,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        alignment_layer: int = -1,
        alignment_heads: int = 1,
        num_source_embeddings: int = 1,
        embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
        layernorm_embedding: bool = False,
        relative_attention_bias: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        multi_query_attention: bool = False,
    ):
        """Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        """
        if isinstance(num_layers, (list, tuple)):
            num_encoder_layers, num_decoder_layers = num_layers
        else:
            num_encoder_layers = num_decoder_layers = num_layers

        # Options accepted by both the encoder and the decoder specification.
        shared_options = dict(
            pre_norm=pre_norm,
            no_final_norm=no_final_norm,
            activation=activation,
            layernorm_embedding=layernorm_embedding,
            relative_position=with_relative_position,
            relative_attention_bias=relative_attention_bias,
            ffn_glu=ffn_glu,
            rms_norm=rms_norm,
            multi_query_attention=multi_query_attention,
        )

        encoder = TransformerEncoderSpec(
            num_encoder_layers,
            num_heads,
            num_source_embeddings=num_source_embeddings,
            embeddings_merge=embeddings_merge,
            **shared_options,
        )
        decoder = TransformerDecoderSpec(
            num_decoder_layers,
            num_heads,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            **shared_options,
        )

        return cls(encoder, decoder)

    @property
    def name(self):
        return "TransformerSpec"

    @property
    def revision(self):
        return 7

    def get_default_config(self):
        return TransformerConfig()

    def get_source_vocabulary_size(self):
        # One vocabulary size per source embedding table.
        return [embeddings.weight.shape[0] for embeddings in self.encoder.embeddings]

    def get_target_vocabulary_size(self):
        return self.decoder.embeddings.weight.shape[0]


class TransformerDecoderModelConfig(model_spec.LanguageModelConfig):
    """Configuration for Transformer decoder models."""

    def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
        """Initializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        """
        # Merge the explicit option into the remaining keyword arguments and
        # let the base class handle all of them.
        options = dict(kwargs, layer_norm_epsilon=layer_norm_epsilon)
        super().__init__(**options)


class TransformerDecoderModelSpec(model_spec.LanguageModelSpec):
    """Describes a Transformer decoder model (e.g. GPT-2)."""

    def __init__(self, decoder: TransformerDecoderSpec):
        """Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        """
        if not isinstance(decoder, TransformerDecoderSpec):
            raise TypeError("decoder argument must be a TransformerDecoderSpec")

        super().__init__()
        self.decoder = decoder
        # Propagate decoder-level configuration (e.g. multi-query attention
        # and quantization settings) to the model configuration.
        for key, value in self.decoder.config.items():
            self._config.add_attribute(key, value)

    @classmethod
    def from_config(
        cls,
        num_layers: int,
        num_heads: int,
        pre_norm: bool = True,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        layernorm_embedding: bool = False,
        no_final_norm: bool = False,
        project_in_out: bool = False,
        with_relative_position: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        alibi: bool = False,
        alibi_use_positive_positions: bool = False,
        scale_alibi: bool = False,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        original_max_position_embeddings: int = 0,
        max_position_embeddings: int = 0,
        parallel_residual: bool = False,
        shared_layer_norm: bool = False,
        pre_post_layer_norm: bool = False,
        multi_query_attention: bool = False,
        num_heads_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        sliding_window: Optional[int] = None,
        quant_type: Optional[common_spec.Quantization] = None,
        quant_group_size: Optional[int] = None,
        quant_bits: Optional[int] = None,
        qk_norm: bool = False,
    ):
        """Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          qk_norm: Apply layer normalization to the query and key projections.
        """
        # Decoder-only model: cross attention is disabled.
        decoder = TransformerDecoderSpec(
            num_layers,
            num_heads,
            pre_norm=pre_norm,
            activation=activation,
            layernorm_embedding=layernorm_embedding,
            with_encoder_attention=False,
            no_final_norm=no_final_norm,
            project_in_out=project_in_out,
            relative_position=with_relative_position,
            ffn_glu=ffn_glu,
            rms_norm=rms_norm,
            alibi=alibi,
            alibi_use_positive_positions=alibi_use_positive_positions,
            scale_alibi=scale_alibi,
            rotary_dim=rotary_dim,
            rotary_interleave=rotary_interleave,
            rotary_scaling_type=rotary_scaling_type,
            rotary_scaling_factor=rotary_scaling_factor,
            rotary_base=rotary_base,
            original_max_position_embeddings=original_max_position_embeddings,
            max_position_embeddings=max_position_embeddings,
            parallel_residual=parallel_residual,
            shared_layer_norm=shared_layer_norm,
            pre_post_layer_norm=pre_post_layer_norm,
            multi_query_attention=multi_query_attention,
            num_heads_kv=num_heads_kv,
            head_dim=head_dim,
            sliding_window=sliding_window,
            quant_type=quant_type,
            quant_group_size=quant_group_size,
            quant_bits=quant_bits,
            qk_norm=qk_norm,
        )

        return cls(decoder)

    @property
    def name(self):
        return "TransformerDecoderSpec"

    @property
    def revision(self):
        return 8

    def get_default_config(self):
        return TransformerDecoderModelConfig()

    def get_vocabulary_size(self):
        return self.decoder.embeddings.weight.shape[0]


class TransformerEncoderModelConfig(model_spec.LanguageModelConfig):
    """Configuration for Transformer encoder models."""

    def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
        """Initializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        """
        # Merge the explicit option into the remaining keyword arguments and
        # let the base class handle all of them.
        options = dict(kwargs, layer_norm_epsilon=layer_norm_epsilon)
        super().__init__(**options)


class TransformerEncoderModelSpec(model_spec.LanguageModelSpec):
    """Describes a Transformer encoder model (e.g. BERT)."""

    def __init__(
        self,
        encoder: TransformerEncoderSpec,
        pooling_layer: bool = False,
        pooling_activation: common_spec.Activation = common_spec.Activation.Tanh,
    ):
        """Initializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        """
        if not isinstance(encoder, TransformerEncoderSpec):
            raise TypeError("encoder argument must be a TransformerEncoderSpec")

        super().__init__()
        self.encoder = encoder
        self._config.add_attribute(
            "multi_query_attention", self.encoder.multi_query_attention
        )

        if not pooling_layer:
            return
        # Optional pooler: a dense projection followed by an activation,
        # stored as a fixed-width NumPy scalar like the other enum attributes.
        self.pooler_dense = common_spec.LinearSpec()
        self.pooler_activation = np.dtype("int8").type(pooling_activation)

    @property
    def name(self):
        return "TransformerEncoderSpec"

    @property
    def revision(self):
        return 1

    def get_vocabulary_size(self):
        # Encoder models use the first (typically only) source embedding table.
        return self.encoder.embeddings[0].weight.shape[0]

    def get_default_config(self):
        return TransformerEncoderModelConfig()