Source code for tfts.layers.position_layer

"""Layer for :py:class:`~tfts.models.transformer`"""

from typing import Dict, Optional, Tuple, Union

import numpy as np
import tensorflow as tf


class PositionalEmbedding(tf.keras.layers.Layer):
    """Sinusoidal positional encoding served from a pre-computed table.

    Implements the fixed sine/cosine encoding from "Attention Is All You
    Need" (Vaswani et al., 2017). The table is computed once in
    :meth:`build`; :meth:`call` returns the encoding for each position of
    the input. The input values are NOT added to the result -- the caller
    is expected to combine the two.

    Args:
        max_len (int, optional): Number of positions in the table. The
            sequence length seen in ``call`` must not exceed it, because
            positions are looked up by index. Defaults to 5000.
        name (str, optional): Layer name. Defaults to None.
        **kwargs: Extra base-layer arguments (e.g. ``trainable``, ``dtype``).
            Accepting them lets ``from_config(get_config())`` round-trip.

    Input shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`

    Output shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`
    """

    def __init__(self, max_len: int = 5000, name: Optional[str] = None, **kwargs):
        super(PositionalEmbedding, self).__init__(name=name, **kwargs)
        self.max_len = max_len
        self.position_enc = None  # filled in by build()

    def build(self, input_shape: Tuple[Optional[int], ...]) -> None:
        """Pre-compute the `(max_len, embedding_dim)` encoding table.

        Args:
            input_shape: Shape of the input tensor; its last entry is the
                embedding dimension and must be known.
        """
        super(PositionalEmbedding, self).build(input_shape)
        # int() keeps the TypeError an unknown (None) embedding dim always raised.
        embed_dim = int(input_shape[-1])

        # Channels 2k and 2k+1 share the exponent 2k / embed_dim.
        channels = np.arange(embed_dim)
        exponents = (channels - channels % 2) / embed_dim
        positions = np.arange(self.max_len, dtype=np.float64)[:, np.newaxis]

        # One vectorized expression instead of max_len * embed_dim scalar calls.
        position_enc = positions / np.power(10000.0, exponents)[np.newaxis, :]
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])

        self.position_enc = tf.convert_to_tensor(position_enc, dtype=tf.float32)

    def call(self, x: tf.Tensor, masking: bool = True) -> tf.Tensor:
        """Return the positional encoding for every position of ``x``.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim).
                Only its shape and (when masking) its zero entries are used.
            masking: If True, the output is set to 0 wherever ``x`` equals 0
                elementwise. Defaults to True.

        Returns:
            float32 tensor with the same shape as ``x``.
        """
        batch_size, seq_length = tf.shape(x)[0], tf.shape(x)[1]

        # Position indices 0..seq_length-1, repeated for every batch row.
        position_ind = tf.tile(tf.expand_dims(tf.range(seq_length), 0), [batch_size, 1])
        outputs = tf.nn.embedding_lookup(self.position_enc, position_ind)

        if masking:
            # Zeros of the output dtype rather than ``x`` itself, so the
            # input does not have to be float32 for tf.where to accept it.
            outputs = tf.where(tf.equal(x, 0), tf.zeros_like(outputs), outputs)

        return tf.cast(outputs, tf.float32)

    def get_config(self) -> Dict[str, Union[int, str]]:
        """Get layer configuration.

        Returns:
            Base layer configuration extended with ``max_len``.
        """
        base_config = super(PositionalEmbedding, self).get_config()
        return {**base_config, "max_len": self.max_len}

    def compute_output_shape(self, input_shape: Tuple[int, int, int]) -> Tuple[int, int, int]:
        """Return ``input_shape`` unchanged; the encoding matches the input shape."""
        return input_shape
class PositionalEncoding(tf.keras.layers.Layer):
    """Sinusoidal positional encoding computed on the fly.

    The encoding is rebuilt with tensor operations on every call from the
    input's sequence length, so no table is pre-computed. The first half of
    the channels holds the sines and the second half the cosines. The input
    values are NOT added to the result -- the caller is expected to combine
    the two.

    Args:
        max_len (int, optional): Maximum sequence length. Stored and
            serialized, but not read by ``call``. Defaults to 5000.
        name (str, optional): Layer name. Defaults to None.
        **kwargs: Extra base-layer arguments (e.g. ``trainable``, ``dtype``).
            Accepting them lets ``from_config(get_config())`` round-trip.

    Input shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`

    Output shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`
    """

    def __init__(self, max_len: int = 5000, name: Optional[str] = None, **kwargs):
        super(PositionalEncoding, self).__init__(name=name, **kwargs)
        self.max_len = max_len

    def build(self, input_shape: Tuple[Optional[int], ...]) -> None:
        """Build the layer; it has no weights of its own.

        Args:
            input_shape: Shape of the input tensor
        """
        super(PositionalEncoding, self).build(input_shape)

    def call(self, x: tf.Tensor, masking: bool = True) -> tf.Tensor:
        """Return the positional encoding for every position of ``x``.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim).
                The embedding dimension must be statically known.
            masking: If True, the output is set to 0 wherever ``x`` equals 0
                elementwise. Defaults to True.

        Returns:
            float32 tensor with the same shape as ``x``.
        """
        d_model = x.get_shape().as_list()[-1]  # embedding dimension
        # For an odd d_model the sine half gets the extra channel, so the
        # concatenation below is exactly d_model wide. For an even d_model
        # both halves are d_model // 2, as before.
        sin_depth = (d_model + 1) // 2
        cos_depth = d_model // 2
        batch_size, seq_length = tf.shape(x)[0], tf.shape(x)[1]

        with tf.name_scope("positional_encode"):
            positions = tf.range(seq_length, dtype=tf.float32)[..., tf.newaxis]  # (seq_length, 1)
            depths = tf.range(sin_depth, dtype=tf.float32)[tf.newaxis, :] / sin_depth  # (1, sin_depth)
            angle_rates = 1 / tf.pow(10000.0, depths)  # (1, sin_depth)
            # Outer product: one angle per (position, frequency) pair.
            angle_rads = tf.matmul(positions, angle_rates)  # (seq_length, sin_depth)

            position_enc = tf.concat(
                [tf.sin(angle_rads), tf.cos(angle_rads[:, :cos_depth])], axis=-1
            )  # (seq_length, d_model)

            position_enc = tf.expand_dims(position_enc, 0)  # (1, seq_length, d_model)
            position_enc = tf.tile(position_enc, [batch_size, 1, 1])  # (batch_size, seq_length, d_model)

        if masking:
            # Zeros of the encoding dtype rather than ``x`` itself, so the
            # input does not have to be float32 for tf.where to accept it.
            position_enc = tf.where(tf.equal(x, 0), tf.zeros_like(position_enc), position_enc)

        return position_enc

    def get_config(self) -> Dict[str, Union[int, str]]:
        """Get layer configuration.

        Returns:
            Base layer configuration extended with ``max_len``.
        """
        base_config = super(PositionalEncoding, self).get_config()
        return {**base_config, "max_len": self.max_len}

    def compute_output_shape(self, input_shape: Tuple[int, int, int]) -> Tuple[int, int, int]:
        """Return ``input_shape`` unchanged; the encoding matches the input shape."""
        return input_shape
class RelativePositionEmbedding(tf.keras.layers.Layer):
    """Learned embeddings for relative positions between queries and values.

    Follows "Self-Attention with Relative Position Representations"
    (Shaw et al., 2018): one trainable vector per relative offset
    ``value_index - query_index``, with offsets clipped to
    ``[-(max_len - 1), max_len - 1]``. The layer returns the embeddings;
    only the shapes of the inputs are read, never their values.

    Args:
        max_len (int, optional): Maximum sequence length. The table holds
            ``2 * max_len - 1`` offsets. Defaults to 512.
        output_dim (int, optional): Output embedding dimension. Defaults to 512.
        name (str, optional): Layer name. Defaults to None.
        **kwargs: Extra base-layer arguments (e.g. ``trainable``, ``dtype``).
            Accepting them lets ``from_config(get_config())`` round-trip.

    Input shape:
        - Tuple of two tensors:
            - Query tensor of shape `(batch_size, query_length, embedding_dim)`
            - Value tensor of shape `(batch_size, value_length, embedding_dim)`

    Output shape:
        - Tensor of shape `(batch_size, query_length, value_length, output_dim)`
    """

    def __init__(self, max_len: int = 512, output_dim: int = 512, name: Optional[str] = None, **kwargs):
        super(RelativePositionEmbedding, self).__init__(name=name, **kwargs)
        self.max_len = max_len
        self.output_dim = output_dim
        self.input_dim = max_len * 2 - 1  # total number of relative positions

    def build(self, input_shape: Tuple[Tuple[Optional[int], ...], Tuple[Optional[int], ...]]) -> None:
        """Create the `(2 * max_len - 1, output_dim)` embedding table.

        Args:
            input_shape: Shapes of the (query, value) input tensors
        """
        super(RelativePositionEmbedding, self).build(input_shape)
        self.embedding_initializer = tf.keras.initializers.get("zeros")
        self.embeddings = self.add_weight(
            name="relative_position_embeddings",
            shape=(self.input_dim, self.output_dim),
            initializer=self.embedding_initializer,
            trainable=True,
        )

    def call(self, inputs: Tuple[tf.Tensor, tf.Tensor]) -> tf.Tensor:
        """Look up the embedding for every (query position, value position) pair.

        Args:
            inputs: Tuple of (query_tensor, value_tensor) where:
                - query_tensor: Shape (batch_size, query_length, embedding_dim)
                - value_tensor: Shape (batch_size, value_length, embedding_dim)

        Returns:
            Tensor of shape (batch_size, query_length, value_length, output_dim)
            containing relative position embeddings.
        """
        q, v = inputs
        q_length = tf.shape(q)[1]
        v_length = tf.shape(v)[1]
        batch_size = tf.shape(q)[0]

        q_idx = tf.range(q_length, dtype=tf.int32)[:, tf.newaxis]  # (q_length, 1)
        v_idx = tf.range(v_length, dtype=tf.int32)[tf.newaxis, :]  # (1, v_length)

        # Signed offset of each value position relative to each query position.
        position_idx = v_idx - q_idx  # (q_length, v_length)

        # Clip to the offsets the table covers, then shift so the smallest
        # offset maps to row 0 of the table.
        max_position = (self.input_dim - 1) // 2
        position_idx = tf.clip_by_value(position_idx, -max_position, max_position)
        position_idx = position_idx + max_position

        embeddings = tf.gather(self.embeddings, position_idx)  # (q_length, v_length, output_dim)

        # The offsets do not depend on the batch, so repeat them per batch row.
        embeddings = tf.expand_dims(embeddings, 0)  # (1, q_length, v_length, output_dim)
        embeddings = tf.tile(embeddings, [batch_size, 1, 1, 1])  # (batch_size, q_length, v_length, output_dim)

        return embeddings

    def get_config(self) -> Dict[str, Union[int, str]]:
        """Get layer configuration.

        Returns:
            Base layer configuration extended with ``max_len`` and ``output_dim``.
        """
        base_config = super(RelativePositionEmbedding, self).get_config()
        return {**base_config, "max_len": self.max_len, "output_dim": self.output_dim}

    def compute_output_shape(
        self, input_shape: Tuple[Tuple[int, int, int], Tuple[int, int, int]]
    ) -> Tuple[int, int, int, int]:
        """Return `(batch_size, query_length, value_length, output_dim)`."""
        query_shape, value_shape = input_shape
        return (query_shape[0], query_shape[1], value_shape[1], self.output_dim)
class RotaryPositionEmbedding(tf.keras.layers.Layer):
    """Rotary position embedding (RoPE).

    Implements the rotation from "RoFormer: Enhanced Transformer with Rotary
    Position Embedding" (Su et al., 2021). Adjacent channel pairs
    ``(2k, 2k + 1)`` of the input are rotated by an angle that grows with
    the sequence position and shrinks with ``k``.

    Args:
        dim (int): Dimension of the input embeddings. Must be even, because
            channels are rotated in pairs.
        name (str, optional): Layer name. Defaults to None.
        **kwargs: Extra base-layer arguments (e.g. ``trainable``, ``dtype``).
            Accepting them lets ``from_config(get_config())`` round-trip.

    Raises:
        ValueError: If ``dim`` is odd.

    Input shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`

    Output shape:
        - 3D tensor with shape `(batch_size, sequence_length, embedding_dim)`
    """

    def __init__(self, dim: int, name: Optional[str] = None, **kwargs):
        super().__init__(name=name, **kwargs)
        # An odd dim can never be reshaped into channel pairs in call();
        # fail here with a clear message instead of deep inside a reshape.
        if dim % 2 != 0:
            raise ValueError(f"RotaryPositionEmbedding requires an even `dim`, got {dim}.")
        self.dim = dim

    def build(self, input_shape: Tuple[Optional[int], ...]) -> None:
        """Build the layer; it has no weights of its own.

        Args:
            input_shape: Shape of the input tensor
        """
        super().build(input_shape)

    def call(self, inputs: tf.Tensor, cache_key: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Rotate the channel pairs of ``inputs`` according to their position.

        Args:
            inputs: Input tensor of shape (batch_size, sequence_length, embedding_dim),
                where embedding_dim equals ``dim``.
            cache_key: Accepted for interface compatibility; currently unused.
                Defaults to None.

        Returns:
            Tensor with the same shape and dtype as ``inputs``.
        """
        batch_size, seq_length = tf.shape(inputs)[0], tf.shape(inputs)[1]

        positions = tf.range(seq_length, dtype=tf.float32)[:, tf.newaxis]  # (seq_length, 1)
        dims = tf.range(self.dim // 2, dtype=tf.float32)[tf.newaxis, :]  # (1, dim/2)
        angle_rates = 1 / tf.pow(10000.0, 2 * dims / self.dim)  # (1, dim/2)
        # Outer product: one rotation angle per (position, channel pair).
        angle_rads = tf.matmul(positions, angle_rates)  # (seq_length, dim/2)

        # Angles are computed in float32; cast the factors to the input dtype
        # so the multiplications below also work for non-float32 inputs.
        cos = tf.cast(tf.cos(angle_rads), inputs.dtype)  # (seq_length, dim/2)
        sin = tf.cast(tf.sin(angle_rads), inputs.dtype)  # (seq_length, dim/2)

        # Split the channels into (even, odd) pairs.
        x = tf.reshape(inputs, [batch_size, seq_length, -1, 2])  # (batch_size, seq_length, dim/2, 2)
        x1, x2 = tf.unstack(x, axis=-1)  # each (batch_size, seq_length, dim/2)

        # 2-D rotation of every pair; cos/sin broadcast over the batch axis.
        rotated = tf.stack(
            [x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1
        )  # (batch_size, seq_length, dim/2, 2)

        outputs = tf.reshape(rotated, [batch_size, seq_length, self.dim])
        return outputs

    def get_config(self) -> Dict[str, Union[int, str]]:
        """Get layer configuration.

        Returns:
            Base layer configuration extended with ``dim``.
        """
        base_config = super().get_config()
        return {**base_config, "dim": self.dim}

    def compute_output_shape(self, input_shape: Tuple[int, int, int]) -> Tuple[int, int, int]:
        """Return ``input_shape`` unchanged; the rotation preserves the shape."""
        return input_shape