
Commit 85ec396

Pu Chen authored and facebook-github-bot committed
Enhance TrainPipelineSparseDist logging to help differentiate data loading patterns in train pipeline (#3350)
Summary: Pull Request resolved: #3350

Observed inconsistent data loading behavior in APS train_module_train_step: 3 batches are expected to load on the first invocation of the train loop, but sometimes only 1 batch load shows up in the trace ([link](https://www.internalfb.com/intern/sbdive/?id=tree%2Fttfb%2Fttfb_ai_lab_APS_mtml_ctr_cmf_rc1_baseline_gpu-f788555024-fbd033a0-89b2-4b72-b540-346901657b25-treatment-1&bucket=sbdive)), even after increasing the trace frequency from 500ms to 50ms. This change adds logs to differentiate the data loading patterns.

Perf impact: the added logs fire only when the data loader is exhausted.

Reviewed By: andywag

Differential Revision: D81418443

fbshipit-source-id: 98ccb5cb480bf31572e99b9796cf375ef676d125
1 parent 60f7f87 commit 85ec396
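
The new messages are emitted at INFO level through the module's logger, so they only show up if that logger is enabled. Below is a minimal sketch for surfacing them in a local run, assuming the module follows the usual logging.getLogger(__name__) pattern; the logger name in the snippet is that assumption, not something introduced by this change.

```python
import logging

# Send INFO-level records to the console so the new pipeline messages are visible.
logging.basicConfig(level=logging.INFO)

# Assumed logger name, derived from the module path via getLogger(__name__);
# adjust it if the module names its logger differently.
logging.getLogger(
    "torchrec.distributed.train_pipeline.train_pipelines"
).setLevel(logging.INFO)
```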

File tree

1 file changed: +17 -0 lines changed


torchrec/distributed/train_pipeline/train_pipelines.py

Lines changed: 17 additions & 0 deletions
@@ -446,6 +446,11 @@ def __init__(
         self._apply_jit = apply_jit
         self._enqueue_batch_after_forward = enqueue_batch_after_forward
 
+        logger.info(
+            f"enqueue_batch_after_forward: {self._enqueue_batch_after_forward} "
+            f"execute_all_batches: {self._execute_all_batches}"
+        )
+
         if device.type == "cuda":
             # use two data streams to support two concurrent batches
             # Dynamo does not support cuda stream specificaiton,
@@ -624,6 +629,7 @@ def fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
 
         # batch i, data (batch) and context
         if not self.enqueue_batch(dataloader_iter):
+            logger.info("fill_pipeline: failed to load batch i")
             return
 
         # modify the (sharded) sparse module forward, and invoke the first part of input_dist
@@ -637,6 +643,7 @@ def fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
 
         # batch i+1
         if not self.enqueue_batch(dataloader_iter):
+            logger.info("fill_pipeline: failed to load batch i+1")
             return
 
     def _wait_for_batch(self) -> None:
@@ -801,7 +808,14 @@ def copy_batch_to_gpu(
         if batch is not None:
             batch = _to_device(batch, self._device, non_blocking=True)
         elif not self._execute_all_batches:
+            logger.info(
+                "copy_batch_to_gpu: raising StopIteration for None Batch (execute_all_batches=False)"
+            )
             raise StopIteration
+        else:
+            logger.info(
+                "copy_batch_to_gpu: returning None batch (execute_all_batches=True)"
+            )
         return batch, context
 
     def _next_batch(self, dataloader_iter: Iterator[In]) -> Optional[In]:
@@ -820,6 +834,9 @@ def _next_batch(self, dataloader_iter: Iterator[In]) -> Optional[In]:
         batch = next(dataloader_iter, None)
         if batch is None:
             self._dataloader_exhausted = True
+
+        if batch is None:
+            logger.info("_next_batch: dataloader exhausted")
         return batch
 
     def start_sparse_data_dist(
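
Taken together, the new messages make the patterns from the summary distinguishable after the fact: a run that only manages to load one batch should show "_next_batch: dataloader exhausted" followed by "fill_pipeline: failed to load batch i+1", while copy_batch_to_gpu now records whether a None batch raises StopIteration (execute_all_batches=False) or is returned as-is (execute_all_batches=True). The following is a simplified, standalone sketch of the fill_pipeline logging pattern, not the actual TrainPipelineSparseDist code; enqueue_batch, device copies, and the pipeline context are omitted.

```python
import logging
from typing import Iterator, Optional, TypeVar

In = TypeVar("In")
logger = logging.getLogger(__name__)


def _next_batch(dataloader_iter: Iterator[In]) -> Optional[In]:
    # Mirrors the patched _next_batch: log once when the iterator runs dry.
    batch = next(dataloader_iter, None)
    if batch is None:
        logger.info("_next_batch: dataloader exhausted")
    return batch


def fill_pipeline(dataloader_iter: Iterator[In]) -> None:
    # Mirrors the patched fill_pipeline: every early return is logged, so a
    # trace with a single loaded batch can be told apart from one where the
    # pipeline was never filled at all.
    if _next_batch(dataloader_iter) is None:
        logger.info("fill_pipeline: failed to load batch i")
        return
    if _next_batch(dataloader_iter) is None:
        logger.info("fill_pipeline: failed to load batch i+1")
        return


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fill_pipeline(iter([object()]))  # only one batch available
    # Expected messages:
    #   _next_batch: dataloader exhausted
    #   fill_pipeline: failed to load batch i+1
```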
