@@ -108,7 +108,7 @@ def __init__(self, cfg: Any) -> None:
         }

     def split_qkv_matrix(
-        self, attention_bridge: JointQKVAttentionBridge
+        self, original_attention_component: Any
     ) -> tuple[torch.nn.Linear, torch.nn.Linear, torch.nn.Linear]:
         """Split the QKV matrix into separate linear transformations.

         Args:
@@ -118,54 +118,49 @@ def split_qkv_matrix(
         """

         # Keep mypy happy
-        assert attention_bridge.original_component is not None
-        assert isinstance(attention_bridge.original_component.query_key_value, LinearBridge)
-        assert attention_bridge.original_component.query_key_value.original_component is not None
+        assert original_attention_component is not None
+        assert original_attention_component.query_key_value is not None

-        qkv_weights = attention_bridge.original_component.query_key_value.original_component.weight
+        qkv_weights = original_attention_component.query_key_value.weight

         # Keep mypy happy
         assert isinstance(qkv_weights, torch.Tensor)

-        d_head = self.cfg.hidden_size // self.cfg.n_head
-
-        # Original qkv_weights shape: [3 * n_head * d_head, d_model]
-        # We want to split it into [d_model, n_head * d_head] for each of Q, K, V
-        W_split = qkv_weights.T.reshape(self.cfg.hidden_size, 3, self.cfg.n_head * d_head)
+        # We want to split weights into [d_model, n_heads * d_head] for each of Q, K, V
+        W_split = qkv_weights.T.reshape(self.cfg.d_model, 3, self.cfg.n_heads * self.cfg.d_head)

         W_Q, W_K, W_V = W_split[:, 0, :], W_split[:, 1, :], W_split[:, 2, :]

-        qkv_bias = attention_bridge.original_component.query_key_value.original_component.bias
+        qkv_bias = original_attention_component.query_key_value.bias

         # Keep mypy happy
         assert isinstance(qkv_bias, torch.Tensor)

-        # Original qkv_bias shape: [3 * n_head * d_head]
-        # Reshape to [3, n_head * d_head] to split by Q, K, V
-        qkv_bias = qkv_bias.reshape(3, self.cfg.n_head * d_head)
+        # Reshape to [3, n_heads * d_head] to split by Q, K, V
+        qkv_bias = qkv_bias.reshape(3, self.cfg.n_heads * self.cfg.d_head)

         b_Q, b_K, b_V = qkv_bias[0, :], qkv_bias[1, :], qkv_bias[2, :]

         # Create nn.Linear modules
-        # W_Q, W_K, W_V shapes are [d_model, n_head * d_head]
+        # W_Q, W_K, W_V shapes are [d_model, n_heads * d_head]
         # nn.Linear expects weight shape [out_features, in_features]
-        # So for Linear(d_model, n_head * d_head), weight should be [n_head * d_head, d_model]
+        # So for Linear(d_model, n_heads * d_head), weight should be [n_heads * d_head, d_model]
         W_Q_transformation = torch.nn.Linear(W_Q.shape[0], W_Q.shape[1], bias=True)
         W_Q_transformation.weight = torch.nn.Parameter(
             W_Q.T
-        )  # Transpose to [n_head * d_head, d_model]
+        )  # Transpose to [n_heads * d_head, d_model]
         W_Q_transformation.bias = torch.nn.Parameter(b_Q)

         W_K_transformation = torch.nn.Linear(W_K.shape[0], W_K.shape[1], bias=True)
         W_K_transformation.weight = torch.nn.Parameter(
             W_K.T
-        )  # Transpose to [n_head * d_head, d_model]
+        )  # Transpose to [n_heads * d_head, d_model]
         W_K_transformation.bias = torch.nn.Parameter(b_K)

         W_V_transformation = torch.nn.Linear(W_V.shape[0], W_V.shape[1], bias=True)
         W_V_transformation.weight = torch.nn.Parameter(
             W_V.T
-        )  # Transpose to [n_head * d_head, d_model]
+        )  # Transpose to [n_heads * d_head, d_model]
         W_V_transformation.bias = torch.nn.Parameter(b_V)

         return W_Q_transformation, W_K_transformation, W_V_transformation
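Note for reviewers: the transpose-and-reshape in split_qkv_matrix assumes the fused query_key_value weight stores the Q, K, and V blocks concatenated along the output dimension, i.e. shape [3 * n_heads * d_head, d_model]. Below is a minimal standalone sanity check of that trick under this assumption; the dimension values are hypothetical stand-ins for cfg.d_model, cfg.n_heads, and cfg.d_head, and tensors are detached so the sketch runs outside any model.

import torch

# Hypothetical dimensions standing in for cfg.d_model, cfg.n_heads, cfg.d_head.
d_model, n_heads, d_head = 16, 4, 4

# Fused QKV projection; assumes Q, K, V blocks are concatenated along the
# output dimension, so weight has shape [3 * n_heads * d_head, d_model].
fused = torch.nn.Linear(d_model, 3 * n_heads * d_head, bias=True)

# Same split as split_qkv_matrix:
# weight.T: [d_model, 3 * n_heads * d_head] -> [d_model, 3, n_heads * d_head]
W_split = fused.weight.T.reshape(d_model, 3, n_heads * d_head)
W_Q = W_split[:, 0, :]
b_Q = fused.bias.reshape(3, n_heads * d_head)[0, :]

# Rebuild a standalone Q projection; nn.Linear wants [out_features, in_features],
# so transpose back. detach() gives leaf tensors for this standalone check.
q_proj = torch.nn.Linear(d_model, n_heads * d_head, bias=True)
q_proj.weight = torch.nn.Parameter(W_Q.T.detach().clone())
q_proj.bias = torch.nn.Parameter(b_Q.detach().clone())

# The split projection must reproduce the Q block of the fused output.
x = torch.randn(2, d_model)
assert torch.allclose(q_proj(x), fused(x)[:, : n_heads * d_head], atol=1e-6)

The same check applies symmetrically to the K and V blocks at indices 1 and 2 of W_split.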