# Source code for tfts.models.gpt

"""
`Language Models are Few-Shot Learners
<https://arxiv.org/abs/2005.14165>`_
"""

from typing import Dict, Optional, Tuple

import tensorflow as tf
from tensorflow.keras.layers import Dense, Reshape

from tfts.layers.embed_layer import DataEmbedding, TokenEmbedding
from tfts.models.transformer import Encoder

from .base import BaseConfig, BaseModel


class GPTConfig(BaseConfig):
    """Configuration for the GPT decoder model, inheriting from `BaseConfig`.

    The hyper-parameters are stored as plain instance attributes and are
    validated at the end of construction.
    """

    model_type: str = "gpt"

    def __init__(
        self,
        hidden_size: int = 64,
        num_layers: int = 2,
        num_attention_heads: int = 4,
        ffn_intermediate_size: int = 256,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        max_position_embeddings: int = 512,
        type_vocab_size: int = 2,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        pad_token_id: int = 0,
        positional_type: str = "absolute",
        use_cache: bool = True,
        dense_units: Tuple[int, ...] = (512, 1024),
        classifier_dropout: Optional[float] = None,
        **kwargs: object,
    ) -> None:
        """Store and validate the GPT hyper-parameters.

        Args:
            hidden_size: The size of the hidden layers. Default is 64.
            num_layers: The number of hidden layers in the transformer encoder. Default is 2.
            num_attention_heads: The number of attention heads in each attention layer. Default is 4.
            ffn_intermediate_size: The size of the intermediate (feed-forward) layer. Default is 256.
            hidden_act: The activation function for hidden layers. Default is "gelu".
            hidden_dropout_prob: The dropout probability for hidden layers. Default is 0.0.
            attention_probs_dropout_prob: The dropout probability for attention probabilities.
                Default is 0.0.
            max_position_embeddings: The maximum length of the input sequences. Default is 512.
            type_vocab_size: The vocabulary size for token types (usually 2). Default is 2.
            initializer_range: The standard deviation for weight initialization. Default is 0.02.
            layer_norm_eps: The epsilon value for layer normalization. Default is 1e-12.
            pad_token_id: The ID for the padding token. Default is 0.
            positional_type: The type of position embedding ("absolute" or "relative").
                Default is "absolute".
            use_cache: Whether to use the cache during inference. Default is True.
            dense_units: Output sizes of the fully connected layers stacked after the encoder,
                one layer per entry. Default is (512, 1024).
            classifier_dropout: Dropout probability for the classifier layer. Default is None.
            **kwargs: Additional keyword arguments passed to the parent `BaseConfig` class.

        Raises:
            ValueError: If a size is not positive, a dropout probability is outside [0, 1),
                or `hidden_size` is not divisible by `num_attention_heads`.
        """
        super().__init__(**kwargs)
        self.hidden_size: int = hidden_size
        self.num_layers: int = num_layers
        self.num_attention_heads: int = num_attention_heads
        self.ffn_intermediate_size: int = ffn_intermediate_size
        self.hidden_act: str = hidden_act
        self.hidden_dropout_prob: float = hidden_dropout_prob
        self.attention_probs_dropout_prob: float = attention_probs_dropout_prob
        self.max_position_embeddings: int = max_position_embeddings
        self.type_vocab_size: int = type_vocab_size
        self.initializer_range: float = initializer_range
        self.layer_norm_eps: float = layer_norm_eps
        self.positional_type: str = positional_type
        self.use_cache: bool = use_cache
        # NOTE: the misspelled attribute name is kept as the stored field because
        # existing code reads `config.dense_unites`; the correctly spelled
        # `dense_units` property below aliases it.
        self.dense_unites: Tuple[int, ...] = dense_units
        self.classifier_dropout: Optional[float] = classifier_dropout
        self.pad_token_id: int = pad_token_id

        # This class is not a dataclass, so `__post_init__` is never invoked
        # automatically; run the checks explicitly once every field is set.
        self._validate()

    @property
    def dense_units(self) -> Tuple[int, ...]:
        """Correctly spelled alias for the stored `dense_unites` attribute."""
        return self.dense_unites

    @dense_units.setter
    def dense_units(self, value: Tuple[int, ...]) -> None:
        self.dense_unites = value

    def __post_init__(self) -> None:
        """Validate configuration parameters."""
        self._validate()

    def _validate(self) -> None:
        """Raise ValueError if any hyper-parameter is out of range."""
        if self.hidden_size <= 0:
            raise ValueError(f"hidden_size must be positive, got {self.hidden_size}")
        if self.num_layers <= 0:
            raise ValueError(f"num_layers must be positive, got {self.num_layers}")
        if self.num_attention_heads <= 0:
            raise ValueError(f"num_attention_heads must be positive, got {self.num_attention_heads}")
        if not 0 <= self.attention_probs_dropout_prob < 1:
            raise ValueError(f"attention_probs_dropout_prob must be in [0, 1), got {self.attention_probs_dropout_prob}")
        if not 0 <= self.hidden_dropout_prob < 1:
            raise ValueError(f"hidden_dropout_prob must be in [0, 1), got {self.hidden_dropout_prob}")
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"hidden_size must be divisible by attention_heads, got {self.hidden_size}/{self.num_attention_heads}"
            )


class GPT(BaseModel):
    """GPT decoder model for time series.

    The input is embedded, passed through the transformer `Encoder`, and the
    representation of the last time step is fed through a stack of ReLU dense
    layers and a linear projection to produce the forecast.
    """

    def __init__(self, predict_sequence_length: int = 1, config: Optional[GPTConfig] = None) -> None:
        """Build the embedding, encoder, dense stack and projection layers.

        Args:
            predict_sequence_length: Number of future steps to forecast; used
                as the output size of the final projection layer.
            config: Model hyper-parameters. A default `GPTConfig` is created
                when omitted.
        """
        super().__init__()
        self.config = config or GPTConfig()
        self.predict_sequence_length = predict_sequence_length

        cfg = self.config
        self.encoder_embedding = TokenEmbedding(cfg.hidden_size)
        self.encoder = Encoder(
            num_hidden_layers=cfg.num_layers,
            hidden_size=cfg.hidden_size,
            num_attention_heads=cfg.num_attention_heads,
            attention_probs_dropout_prob=cfg.attention_probs_dropout_prob,
            ffn_intermediate_size=cfg.ffn_intermediate_size,
            hidden_dropout_prob=cfg.hidden_dropout_prob,
        )
        # One ReLU dense layer per configured size (attribute name is spelled
        # `dense_unites` on the config object).
        self.dense_layers = [Dense(units, activation="relu") for units in cfg.dense_unites]
        self.projection = Dense(predict_sequence_length, activation=None)

    @staticmethod
    def _merge_inputs(inputs):
        """Return the tensor fed to the embedding layer for any supported input form."""
        if isinstance(inputs, (list, tuple)):
            # Exactly three elements are expected; the third (decoder feature)
            # is not used by this model.
            target, extra_features, _ = inputs
            return tf.concat([target, extra_features], axis=-1)
        if isinstance(inputs, dict):
            return tf.concat([inputs["x"], inputs["encoder_feature"]], axis=-1)
        return inputs

    def __call__(
        self,
        inputs: tf.Tensor,
        teacher: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> tf.Tensor:
        """GPT model forward pass.

        Args:
            inputs: Input time-series data, can be:
                - A single tensor (presumably [batch_size, seq_len, feature_dim]).
                - A tuple/list of (x, encoder_feature, decoder_feature); x and
                  encoder_feature are concatenated on the last axis and
                  decoder_feature is ignored.
                - A dictionary with keys 'x' and 'encoder_feature', which are
                  concatenated on the last axis.
            teacher: Optional teacher forcing tensor. Not used by this
                implementation.
            output_hidden_states: When truthy, return the encoder output
                instead of the forecast.
            return_dict: Accepted for interface compatibility. Not used by
                this implementation.

        Returns:
            The encoder output when `output_hidden_states` is truthy; otherwise
            the forecast reshaped to [batch_size, predict_sequence_length, 1].
        """
        embedded = self.encoder_embedding(self._merge_inputs(inputs))
        memory = self.encoder(embedded, mask=None)

        if output_hidden_states:
            # (batch_size, train_sequence_length, hidden_size)
            return memory

        # Only the last time step of the encoder output drives the forecast.
        hidden = memory[:, -1]
        for dense in self.dense_layers:
            hidden = dense(hidden)

        projected = self.projection(hidden)
        # Append a trailing axis of size 1 to the projected forecast.
        return Reshape((projected.shape[1], 1))(projected)