ADLR/megatron-lm!1788 - chore: Reformat all documents
ko3n1g committed Aug 8, 2024
1 parent 2c47ea2 commit 703cc88
Showing 165 changed files with 2,878 additions and 2,352 deletions.
4 changes: 4 additions & 0 deletions .flake8
@@ -0,0 +1,4 @@
[flake8]
max-line-length = 100
extend-ignore = E203
per-file-ignores = __init__.py:F401
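
For context on these settings: E203 ("whitespace before ':'") is routinely ignored when flake8 runs alongside black, because black puts spaces around the colon of a slice whose bounds are compound expressions, and the per-file-ignores entry silences F401 ("imported but unused") in __init__.py files that exist only to re-export names. A small illustrative sketch (hypothetical code, not taken from the repository):

# E203: black formats slices with compound bounds like this; without
# extend-ignore = E203, flake8 would flag the space before the ":".
def trim(tokens, pad):
    return tokens[pad + 1 : len(tokens) - pad - 1]

# F401: in an __init__.py, a pure re-export such as the line below is
# "unused" from flake8's point of view; per-file-ignores permits it.
# from .bert_dataset import BERTMaskedWordPieceDataset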
7 changes: 7 additions & 0 deletions .pylintrc
@@ -0,0 +1,7 @@
[MASTER]
ignore=tests

[MESSAGES CONTROL]
disable=all

enable=C0115,C0116
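
For reference, C0115 and C0116 are pylint's missing-class-docstring and missing-function-docstring checks, so this configuration disables every other message and turns pylint into a docstring-only gate, with the tests directory skipped. A minimal sketch of what it would and would not report (hypothetical code, not from the repository):

class ChunkReader:  # C0115: missing-class-docstring
    def read(self, path):  # C0116: missing-function-docstring
        return open(path, "rb").read()


class IndexedReader:
    """Reads records from a binary file."""

    def read(self, path):
        """Return the raw bytes stored at `path`."""
        return open(path, "rb").read()  # documented: no messages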
4 changes: 3 additions & 1 deletion Dockerfile.linting
@@ -10,7 +10,9 @@ RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \

RUN pip3 install --no-cache-dir \
black==24.4.2 \
isort
isort==5.13.2 \
flake8==7.1.0 \
pylint==3.2.6

COPY . /opt/megatron-lm

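The Python hunks that follow largely share one pattern: calls, literals, and docstrings that were wrapped across several lines are collapsed onto a single line wherever they fit within the 100-character limit, and trailing commas inside such constructs are dropped so the formatter keeps them collapsed. A hypothetical before/after sketch (assuming black with a 100-character line length and the magic trailing comma disabled; the exact formatter configuration is not shown in this excerpt):

# Before reformatting: wrapped, with a trailing comma that would
# otherwise keep black from joining the arguments onto one line.
def read_sequence_before(reader, dtype, count, offset):
    return reader.read(
        dtype=dtype,
        count=count,
        offset=offset,
    )

# After reformatting: the call fits within 100 characters, so it is
# collapsed and the trailing comma is removed.
def read_sequence_after(reader, dtype, count, offset):
    return reader.read(dtype=dtype, count=count, offset=offset)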
19 changes: 6 additions & 13 deletions megatron/core/datasets/bert_dataset.py
@@ -21,8 +21,7 @@ class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
"""Option to perform the next sequence prediction during sampling"""

def __post_init__(self) -> None:
"""Do asserts and set fields post init
"""
"""Do asserts and set fields post init"""
super().__post_init__()

assert self.classification_head is not None
@@ -73,22 +72,20 @@ def _key_config_attributes() -> List[str]:
"""
return super(
BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset
)._key_config_attributes() + ["classification_head",]
)._key_config_attributes() + ["classification_head"]

def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
"""Abstract method implementation
Args:
idx (int): The index into the dataset
Returns:
Dict[str, Union[int, numpy.ndarray]]: The
Dict[str, Union[int, numpy.ndarray]]: The
"""
idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
numpy_random_state = numpy.random.RandomState(
seed=(self.config.random_seed + idx) % 2 ** 32
)
numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)

assert target_sequence_length <= self.config.sequence_length

Expand Down Expand Up @@ -127,11 +124,7 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
truncated = True

# Merge the subsegments and create the token assignment labels
tokens = [
self.config.tokenizer.cls,
*split_A,
self.config.tokenizer.sep,
]
tokens = [self.config.tokenizer.cls, *split_A, self.config.tokenizer.sep]
assignments = [0 for _ in range(1 + len(split_A) + 1)]
if split_B:
tokens += [*split_B, self.config.tokenizer.sep]
9 changes: 2 additions & 7 deletions megatron/core/datasets/blended_dataset.py
@@ -93,10 +93,7 @@ def __len__(self) -> int:
def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
dataset_id = self.dataset_index[idx]
dataset_sample_id = self.dataset_sample_index[idx]
return {
"dataset_id": dataset_id,
**self.datasets[dataset_id][dataset_sample_id],
}
return {"dataset_id": dataset_id, **self.datasets[dataset_id][dataset_sample_id]}

def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
"""Build and optionally cache the dataset index and the dataset sample index
@@ -129,9 +126,7 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0):
log_single_rank(
logger,
logging.INFO,
f"Build and save the {type(self).__name__} indices",
logger, logging.INFO, f"Build and save the {type(self).__name__} indices"
)
self.built_anew_on_cache_miss = True

21 changes: 4 additions & 17 deletions megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -156,9 +156,7 @@ def build(self) -> List[Optional[TopLevelDataset]]:

return datasets

def _build_blended_dataset_splits(
self,
) -> List[Optional[TopLevelDataset]]:
def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]:
"""Build all dataset splits according to the provided blend(s)
See the BlendedMegatronDatasetBuilder.build alias for more information.
@@ -306,10 +304,7 @@ def _build_blended_dataset_splits(
return blended_datasets

def _build_megatron_datasets_parallel(
self,
prefixes: List[str],
split: List[float],
sizes_per_dataset: List[List[int]],
self, prefixes: List[str], split: List[float], sizes_per_dataset: List[List[int]]
) -> List[List[Optional[MegatronDataset]]]:
"""Build the megatron datasets for a list of prefixes in parallel
@@ -369,11 +364,7 @@ def _threading_helper(
# i.e. meant for serial build, do not scale up.
num_workers *= min(2, max(1, torch.cuda.device_count()))
_threading_helper(
megatron_datasets,
num_workers,
prefixes,
split,
sizes_per_dataset,
megatron_datasets, num_workers, prefixes, split, sizes_per_dataset
)

torch.distributed.barrier()
@@ -389,11 +380,7 @@ def _threading_helper(
)
else:
_threading_helper(
megatron_datasets,
num_dataset_builder_threads,
prefixes,
split,
sizes_per_dataset,
megatron_datasets, num_dataset_builder_threads, prefixes, split, sizes_per_dataset
)

return megatron_datasets
8 changes: 3 additions & 5 deletions megatron/core/datasets/gpt_dataset.py
@@ -108,11 +108,9 @@ def __init__(
except Exception:
self._pad_token_id = _PAD_TOKEN_ID

(
self.document_index,
self.sample_index,
self.shuffle_index,
) = self._build_document_sample_shuffle_indices()
(self.document_index, self.sample_index, self.shuffle_index) = (
self._build_document_sample_shuffle_indices()
)

@staticmethod
def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
11 changes: 2 additions & 9 deletions megatron/core/datasets/indexed_dataset.py
@@ -385,12 +385,7 @@ def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndar
Returns:
numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`.
"""
return numpy.frombuffer(
self._bin_buffer,
dtype=dtype,
count=count,
offset=offset,
)
return numpy.frombuffer(self._bin_buffer, dtype=dtype, count=count, offset=offset)

def __del__(self) -> None:
"""Clean up the object."""
@@ -633,9 +628,7 @@ def __getitem__(
if isinstance(idx, (int, numpy.integer)):
sequence_pointer, sequence_length, sequence_mode = self.index[idx]
sequence = self.bin_reader.read(
dtype=self.index.dtype,
count=sequence_length,
offset=sequence_pointer,
dtype=self.index.dtype, count=sequence_length, offset=sequence_pointer
)
return (sequence, sequence_mode) if sequence_mode is not None else sequence
elif isinstance(idx, slice):
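As a concrete illustration of the read() behaviour documented above (count items of a given dtype, starting at a byte offset), here is a self-contained sketch against an in-memory buffer rather than the repository's data files:

import numpy

# Ten int32 values 0..9, serialized to 40 bytes.
buffer = numpy.arange(10, dtype=numpy.int32).tobytes()

# Read 4 int32 items starting at byte offset 8, i.e. elements 2..5.
window = numpy.frombuffer(buffer, dtype=numpy.int32, count=4, offset=8)
assert window.tolist() == [2, 3, 4, 5]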
10 changes: 1 addition & 9 deletions megatron/core/datasets/masked_dataset.py
@@ -154,15 +154,7 @@ def _build_sample_index(
)
path_to_description = get_path_to("description.txt")
path_to_sample_index = get_path_to("sample_index.npy")
cache_hit = all(
map(
os.path.isfile,
[
path_to_description,
path_to_sample_index,
],
)
)
cache_hit = all(map(os.path.isfile, [path_to_description, path_to_sample_index]))

if self.num_samples is not None:
num_epochs = numpy.iinfo(numpy.int32).max - 1
48 changes: 9 additions & 39 deletions megatron/core/datasets/retro/db/build.py
@@ -95,23 +95,13 @@ def build_partial_db(
if proc_id in progress_proc_ids:
log_retro_rank_0(
" > building partial chunk db, proc %d / %d, docs %d:%d / %d."
% (
proc_id,
n_procs,
doc_start_id,
doc_end_id,
n_docs,
)
% (proc_id, n_procs, doc_start_id, doc_end_id, n_docs)
)

# Progress bars (snapshot of overall progress).
doc_id_iter = range(doc_start_id, doc_end_id)
pbar = (
tqdm(
doc_id_iter,
"parse doc chunks",
miniters=len(doc_id_iter) // 20,
)
tqdm(doc_id_iter, "parse doc chunks", miniters=len(doc_id_iter) // 20)
if proc_id in progress_proc_ids
else doc_id_iter
)
@@ -156,9 +146,7 @@ def build_partial_db(
# Re-tokenize.
chunk_end_idx = chunk_end_idxs[i]
gpt_token_ids = indexed_dataset.get(
idx=doc_id,
offset=chunk_start_idx,
length=chunk_end_idx - chunk_start_idx,
idx=doc_id, offset=chunk_start_idx, length=chunk_end_idx - chunk_start_idx
)
text = config.gpt_detokenize(gpt_token_ids.tolist())
bert_token_ids = config.bert_tokenize(text)
@@ -169,14 +157,7 @@
else:
_chunk_db = chunk_db_valid
doc_size_map[doc_id] += 1
_chunk_db.append(
(
doc_id,
chunk_start_idx,
chunk_end_idx,
len(bert_token_ids),
)
)
_chunk_db.append((doc_id, chunk_start_idx, chunk_end_idx, len(bert_token_ids)))

return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map

@@ -269,10 +250,7 @@ def build_block_db(


def save_block_db(
block: dict,
chunk_db_valid: np.ndarray,
chunk_db_invalid: np.ndarray,
doc_offsets: np.ndarray,
block: dict, chunk_db_valid: np.ndarray, chunk_db_invalid: np.ndarray, doc_offsets: np.ndarray
) -> None:
"""Save block of chunked tokens to disk. These blocks are later used for
training and adding to the vector index.
@@ -291,10 +269,7 @@


def build_individual_db(
config: RetroPreprocessingConfig,
dataset_idx: int,
n_datasets: int,
dataset_info: dict,
config: RetroPreprocessingConfig, dataset_idx: int, n_datasets: int, dataset_info: dict
) -> None:
"""Process a single indexed dataset & extract chunks.
@@ -395,8 +370,7 @@ def build_individual_db(


def build_individual_dbs(
config: RetroPreprocessingConfig,
indexed_dataset_infos: List[Dict],
config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict]
) -> None:
"""Iterate each indexed dataset & process its chunks.
@@ -412,11 +386,7 @@
# Progress.
log_retro_rank_0(
" > building individual db, dataset %d / %d ... '%s'."
% (
ds_idx,
len(indexed_dataset_infos),
ds_info["prefix"],
)
% (ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
)

# Process single dataset.
@@ -562,7 +532,7 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)
for ds_idx, ds_info in enumerate(indexed_dataset_infos):
log_retro_rank_0(
" > merging dbs; '%s', dataset %d / %d ... '%s'."
% (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"]),
% (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
)
individual_chunk_db: np.ndarray = get_individual_chunk_db(project_dir, ds_idx, ds_info)
individual_doc_offsets: np.ndarray = (
7 changes: 2 additions & 5 deletions megatron/core/datasets/retro/db/dataset.py
@@ -17,7 +17,7 @@

class DBDataset(torch.utils.data.Dataset):
"""Dataset for iterating chunks.
Args:
db_path (str): Path of HDF5-format chunk database.
indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
@@ -85,10 +85,7 @@ def __getitem__(self, chunk_id: int) -> dict:
token_ids = token_ids.tolist()
token_ids += [self.eod_token_id] * (self.max_chunk_length - chunk_length)

return {
"doc_id": doc_id,
"text": np.array(token_ids, dtype=np.int64),
}
return {"doc_id": doc_id, "text": np.array(token_ids, dtype=np.int64)}

def load_doc_tuples(self) -> None:
"""Load the dataset & document ids.
6 changes: 2 additions & 4 deletions megatron/core/datasets/retro/db/utils.py
@@ -22,7 +22,7 @@ def get_db_dir(project_dir: str) -> str:
Args:
project_dir (str): Path to Retro project dir.
Returns:
Path of the DB sub-directory within the project.
"""
@@ -55,9 +55,7 @@ def init_indexed_dataset_infos(config: RetroPreprocessingConfig) -> List[Dict]:
prefix = data_blend[i + 1]
path = os.path.join(data_dir, prefix + ".bin")
assert os.path.exists(path), "couldn't find '%s'." % path
infos.append(
{"ratio": ratio, "prefix": prefix,}
)
infos.append({"ratio": ratio, "prefix": prefix})

# Load indexed datasets.
load_indexed_datasets(config.retro_project_dir, infos)
6 changes: 1 addition & 5 deletions megatron/core/datasets/retro/external_libs.py
@@ -4,11 +4,7 @@

import importlib

required_libs = [
"faiss",
"h5py",
"transformers", # for huggingface bert
]
required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert

for lib in required_libs:
try:
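The body of the import-check loop is collapsed in this view; a common shape for this kind of optional-dependency guard, given here only as a hedged sketch and not as the file's actual continuation, is:

import importlib

required_libs = ["faiss", "h5py", "transformers"]  # for huggingface bert

for lib in required_libs:
    try:
        importlib.import_module(lib)
    except ImportError as exc:
        raise ImportError(
            f"Retro preprocessing requires the '{lib}' package; please install it first."
        ) from exc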
2 changes: 1 addition & 1 deletion megatron/core/datasets/retro/index/build.py
@@ -41,7 +41,7 @@ def get_empty_index_path(config: RetroPreprocessingConfig) -> str:
Args:
config (RetroPreprocessingConfig): Retro preprocessing config.
Returns:
Path to the empty (trained, but without added samples) vector index.
"""
2 changes: 1 addition & 1 deletion megatron/core/datasets/retro/index/factory.py
@@ -23,7 +23,7 @@ def get_index_class(cls, index_type: str) -> type:
Returns:
An `Index` sub-type corresponding to the `index_type`.
"""
return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex,}[index_type]
return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex}[index_type]

@classmethod
def get_index(cls, index_type: str) -> Index: