fix default uniform init range for kvzch (#3336)

emlin · facebook-github-bot · commit 2503b812a2a4 · 2025-09-04T15:18:37.000-07:00
Summary: Since ZCH v.Next uses a very large virtual table size (2^50), the default uniform init range becomes very small, and when the weight dtype is half, those values essentially become 0. We observed in the debug log that the initial weight values are all 0: https://fburl.com/mlhub/aea9mbzf {F1981621246} Differential Revision: D81296621
diff --git a/torchrec/distributed/batched_embedding_kernel.py b/torchrec/distributed/batched_embedding_kernel.py
@@ -14,6 +14,7 @@
 import logging
 import tempfile
 from dataclasses import dataclass
+from math import sqrt
 from typing import (
     Any,
     cast,
@@ -151,7 +152,9 @@ def _populate_res_params(config: GroupedEmbeddingConfig) -> Tuple[bool, RESParam
     return (enable_raw_embedding_streaming, res_params)
 
 
-def _populate_ssd_tbe_params(config: GroupedEmbeddingConfig) -> Dict[str, Any]:
+def _populate_ssd_tbe_params(
+    config: GroupedEmbeddingConfig, is_kvzch: bool = False
+) -> Dict[str, Any]:
     """
     Construct SSD TBE params dict from config and fused params dict.
     """
@@ -192,6 +195,9 @@ def _populate_ssd_tbe_params(config: GroupedEmbeddingConfig) -> Dict[str, Any]:
         )
 
     # populate init min and max
+    if is_kvzch:
+        _generate_init_range_for_kvzch(ssd_tbe_params, config)
+
     if (
         "ssd_uniform_init_lower" not in ssd_tbe_params
         or "ssd_uniform_init_upper" not in ssd_tbe_params
@@ -245,6 +251,50 @@ def _populate_ssd_tbe_params(config: GroupedEmbeddingConfig) -> Dict[str, Any]:
     return ssd_tbe_params
 
 
+def _generate_init_range_for_kvzch(
+    tbe_params: Dict[str, Any],
+    config: GroupedEmbeddingConfig,
+) -> None:
+    """
+    Fill in the uniform init range for zero collision TBE when tbe_params lacks it.
+    """
+    # compute the range only if a bound is missing; both keys are then written
+    if (
+        "ssd_uniform_init_lower" not in tbe_params
+        or "ssd_uniform_init_upper" not in tbe_params
+    ):
+        # Per-table init min/max is not supported yet, so the per-table values
+        # are averaged into one range below. Supporting it would require either
+        # allowing it in SSD TBE or creating one SSD TBE per table.
+        weights_precision = data_type_to_sparse_type(config.data_type)
+
+        # float32: +/-sqrt(1 / max_size); any other dtype (e.g. half): fixed +/-0.001
+        max_size = 4_000_000_000  # 4B virtual embeddings
+        default_init_range = (
+            (-sqrt(1 / max_size), sqrt(1 / max_size))
+            if weights_precision.as_dtype() == torch.float32
+            else (-0.001, 0.001)
+        )
+
+        def get_init_value(
+            table_init_val: Optional[float], default_value: float
+        ) -> float:
+            return table_init_val if table_init_val is not None else default_value
+
+        init_mins = [
+            get_init_value(table.weight_init_min, default_init_range[0])
+            for table in config.embedding_tables
+        ]
+        init_maxs = [
+            get_init_value(table.weight_init_max, default_init_range[1])
+            for table in config.embedding_tables
+        ]
+
+        num_tables = len(config.embedding_tables)
+        tbe_params["ssd_uniform_init_lower"] = sum(init_mins) / num_tables
+        tbe_params["ssd_uniform_init_upper"] = sum(init_maxs) / num_tables
+
+
 def _populate_zero_collision_tbe_params(
     tbe_params: Dict[str, Any],
     sharded_local_buckets: List[Tuple[int, int, int]],
@@ -1878,7 +1928,7 @@ def __init__(
                 "not divisible by 4. "
             )
 
-        ssd_tbe_params = _populate_ssd_tbe_params(config)
+        ssd_tbe_params = _populate_ssd_tbe_params(config, is_kvzch=True)
         self._bucket_spec: List[Tuple[int, int, int]] = (
             _get_sharded_local_buckets_for_zero_collision(
                 self._config.embedding_tables, self._pg
@@ -2758,7 +2808,7 @@ def __init__(
                 "not divisible by 4. "
             )
 
-        ssd_tbe_params = _populate_ssd_tbe_params(config)
+        ssd_tbe_params = _populate_ssd_tbe_params(config, is_kvzch=True)
         self._bucket_spec: List[Tuple[int, int, int]] = (
             _get_sharded_local_buckets_for_zero_collision(
                 self._config.embedding_tables, self._pg