fix load checkpoint issue when load_state_dict with assign=True (#3369)

TroyGarden · facebook-github-bot · commit b70845a9d595 · 2025-09-11T06:19:31.000-07:00
Summary: Pull Request resolved: #3369

# Issue Summary
The PositionWeightedModuleCollection class in TorchRec maintains two separate references to position weight parameters: position_weights (ParameterDict) and position_weights_dict (regular dict). When checkpoints are loaded using load_state_dict(..., assign=True), the position_weights_dict becomes desynchronized and continues pointing to stale parameter tensors.

# Impact
- Silent correctness failures during model evaluation and inference
- Training instability when loading checkpoints for recurring training
- Production issues in model serving pipelines
- Inconsistent results between fresh model initialization and checkpoint loading

# Root Cause Analysis
1. Initialization: During module construction, position_weights_dict[key] = self.position_weights[key] creates references to parameter tensors (line 190)
2. Checkpoint Loading: When load_state_dict(..., assign=True) is called, PyTorch replaces the actual parameter tensors with new ones from the checkpoint
3. Stale References: The position_weights_dict continues pointing to the old parameter tensors that are no longer part of the model
4. Silent Failure: The get_weights_list() function uses position_weights_dict (line 212), causing the model to use incorrect weights without any error

Reviewed By: dyerinoon
Differential Revision: D81749871
fbshipit-source-id: 81f6c4dddef42b377598511396b3210669e4db36
diff --git a/torchrec/modules/feature_processor_.py b/torchrec/modules/feature_processor_.py
@@ -10,11 +10,12 @@
 #!/usr/bin/env python3
 
 import abc
-from typing import Dict, List, Optional
+from typing import Dict, List, Mapping, Optional
 
 import torch
 
 from torch import nn
+from torch.nn.modules.module import _IncompatibleKeys
 
 from torchrec.pt2.checks import is_non_strict_exporting
 from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
@@ -232,3 +233,15 @@ def _apply(self, *args, **kwargs) -> nn.Module:
             self.position_weights_dict[k] = param
 
         return self
+
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, torch.Tensor],
+        strict: bool = True,
+        assign: bool = False,
+    ) -> _IncompatibleKeys:
+        result = super().load_state_dict(state_dict, strict, assign)  # parent class performs the actual load
+        # Re-sync after loading: re-point each position_weights_dict entry at the current position_weights entry, since assign=True can replace the parameter objects
+        for k, param in self.position_weights.items():
+            self.position_weights_dict[k] = param
+        return result  # NOTE(review): presumably a parent module's load_state_dict does not call this override - confirm nested loads also re-sync
diff --git a/torchrec/modules/tests/test_feature_processor_.py b/torchrec/modules/tests/test_feature_processor_.py
@@ -209,3 +209,43 @@ def test_to(self) -> None:
         self.assertTrue(
             all(param.is_meta for param in pwmc.position_weights_dict.values())
         )
+
+    def test_load_state_dict(self) -> None:
+        values = torch.tensor([10, 11, 12, 20, 21, 22])
+        lengths = torch.tensor([3, 3])
+        kjt = KeyedJaggedTensor(
+            keys=["feature1", "feature2"], values=values, lengths=lengths
+        )
+
+        # Step 1: Create the module and check its initial state
+        max_feature_lengths = {"feature1": 3, "feature2": 3}
+        module = PositionWeightedModuleCollection(max_feature_lengths)
+
+        # Before checkpoint loading, each position_weights_dict entry is the same object as the matching position_weights entry
+        for f in ["feature1", "feature2"]:
+            self.assertIs(
+                module.position_weights[f],
+                module.position_weights_dict[f],
+            )
+
+        output = module(kjt)
+        expected = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]  # weights asserted for the freshly constructed module
+        self.assertListEqual(output.weights().tolist(), expected)
+
+        # Step 2: Simulate checkpoint loading with assign=True
+        checkpoint = {
+            "position_weights.feature1": torch.tensor([2.0, 3.0, 4.0]),
+            "position_weights.feature2": torch.tensor([5.0, 6.0, 7.0]),
+        }
+        module.load_state_dict(checkpoint, strict=False, assign=True)  # direct call on the module; loading through a parent module is not exercised here
+
+        # After checkpoint loading, each position_weights_dict entry must again be the same object as the matching position_weights entry
+        for f in ["feature1", "feature2"]:
+            self.assertIs(
+                module.position_weights[f],
+                module.position_weights_dict[f],
+            )
+
+        output = module(kjt)
+        expected = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]  # the checkpoint values: feature1 followed by feature2
+        self.assertListEqual(output.weights().tolist(), expected)