ADLR/megatron-lm!1788 - chore: Reformat all documents
ko3n1g committed Aug 8, 2024
1 parent 2c47ea2 commit 703cc88
Showing 165 changed files with 2,878 additions and 2,352 deletions.
4 changes: 4 additions & 0 deletions .flake8
@@ -0,0 +1,4 @@
[flake8]
max-line-length = 100
extend-ignore = E203
per-file-ignores = __init__.py:F401
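
For context on these settings: E203 ("whitespace before ':'") is routinely ignored when flake8 runs alongside black, because black puts spaces around the colon of a slice whose bounds are compound expressions, and the per-file-ignores entry silences F401 ("imported but unused") in __init__.py files that exist only to re-export names. A small illustrative sketch (hypothetical code, not taken from the repository):

# E203: black formats slices with compound bounds like this; without
# extend-ignore = E203, flake8 would flag the space before the ":".
def trim(tokens, pad):
    return tokens[pad + 1 : len(tokens) - pad - 1]

# F401: in an __init__.py, a pure re-export such as the line below is
# "unused" from flake8's point of view; per-file-ignores permits it.
# from .bert_dataset import BERTMaskedWordPieceDataset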
7 changes: 7 additions & 0 deletions .pylintrc
@@ -0,0 +1,7 @@
[MASTER]
ignore=tests

[MESSAGES CONTROL]
disable=all

enable=C0115,C0116
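
For reference, C0115 and C0116 are pylint's missing-class-docstring and missing-function-docstring checks, so this configuration disables every other message and turns pylint into a docstring-only gate, with the tests directory skipped. A minimal sketch of what it would and would not report (hypothetical code, not from the repository):

class ChunkReader:  # C0115: missing-class-docstring
    def read(self, path):  # C0116: missing-function-docstring
        return open(path, "rb").read()


class IndexedReader:
    """Reads records from a binary file."""

    def read(self, path):
        """Return the raw bytes stored at `path`."""
        return open(path, "rb").read()  # documented: no messages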
4 changes: 3 additions & 1 deletion Dockerfile.linting
@@ -10,7 +10,9 @@ RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \

RUN pip3 install --no-cache-dir \
black==24.4.2 \
isort
isort==5.13.2 \
flake8==7.1.0 \
pylint==3.2.6

COPY . /opt/megatron-lm

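The Python hunks that follow largely share one pattern: calls, literals, and docstrings that were wrapped across several lines are collapsed onto a single line wherever they fit within the 100-character limit, and trailing commas inside such constructs are dropped so the formatter keeps them collapsed. A hypothetical before/after sketch (assuming black with a 100-character line length and the magic trailing comma disabled; the exact formatter configuration is not shown in this excerpt):

# Before reformatting: wrapped, with a trailing comma that would
# otherwise keep black from joining the arguments onto one line.
def read_sequence_before(reader, dtype, count, offset):
    return reader.read(
        dtype=dtype,
        count=count,
        offset=offset,
    )

# After reformatting: the call fits within 100 characters, so it is
# collapsed and the trailing comma is removed.
def read_sequence_after(reader, dtype, count, offset):
    return reader.read(dtype=dtype, count=count, offset=offset)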
19 changes: 6 additions & 13 deletions megatron/core/datasets/bert_dataset.py
@@ -21,8 +21,7 @@ class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig):
"""Option to perform the next sequence prediction during sampling"""

def __post_init__(self) -> None:
"""Do asserts and set fields post init
"""
"""Do asserts and set fields post init"""
super().__post_init__()

assert self.classification_head is not None
@@ -73,22 +72,20 @@ def _key_config_attributes() -> List[str]:
"""
return super(
BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset
)._key_config_attributes() + ["classification_head",]
)._key_config_attributes() + ["classification_head"]

def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
"""Abstract method implementation
Args:
idx (int): The index into the dataset
Returns:
Dict[str, Union[int, numpy.ndarray]]: The
Dict[str, Union[int, numpy.ndarray]]: The
"""
idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
numpy_random_state = numpy.random.RandomState(
seed=(self.config.random_seed + idx) % 2 ** 32
)
numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)

assert target_sequence_length <= self.config.sequence_length

Expand Down Expand Up @@ -127,11 +124,7 @@ def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
truncated = True

# Merge the subsegments and create the token assignment labels
tokens = [
self.config.tokenizer.cls,
*split_A,
self.config.tokenizer.sep,
]
tokens = [self.config.tokenizer.cls, *split_A, self.config.tokenizer.sep]
assignments = [0 for _ in range(1 + len(split_A) + 1)]
if split_B:
tokens += [*split_B, self.config.tokenizer.sep]
9 changes: 2 additions & 7 deletions megatron/core/datasets/blended_dataset.py
@@ -93,10 +93,7 @@ def __len__(self) -> int:
def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
dataset_id = self.dataset_index[idx]
dataset_sample_id = self.dataset_sample_index[idx]
return {
"dataset_id": dataset_id,
**self.datasets[dataset_id][dataset_sample_id],
}
return {"dataset_id": dataset_id, **self.datasets[dataset_id][dataset_sample_id]}

def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
"""Build and optionally cache the dataset index and the dataset sample index
@@ -129,9 +126,7 @@ def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:

if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0):
log_single_rank(
logger,
logging.INFO,
f"Build and save the {type(self).__name__} indices",
logger, logging.INFO, f"Build and save the {type(self).__name__} indices"
)
self.built_anew_on_cache_miss = True

21 changes: 4 additions & 17 deletions megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -156,9 +156,7 @@ def build(self) -> List[Optional[TopLevelDataset]]:

return datasets

def _build_blended_dataset_splits(
self,
) -> List[Optional[TopLevelDataset]]:
def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]:
"""Build all dataset splits according to the provided blend(s)
See the BlendedMegatronDatasetBuilder.build alias for more information.
@@ -306,10 +304,7 @@ def _build_blended_dataset_splits(
return blended_datasets

def _build_megatron_datasets_parallel(
self,
prefixes: List[str],
split: List[float],
sizes_per_dataset: List[List[int]],
self, prefixes: List[str], split: List[float], sizes_per_dataset: List[List[int]]
) -> List[List[Optional[MegatronDataset]]]:
"""Build the megatron datasets for a list of prefixes in parallel
@@ -369,11 +364,7 @@ def _threading_helper(
# i.e. meant for serial build, do not scale up.
num_workers *= min(2, max(1, torch.cuda.device_count()))
_threading_helper(
megatron_datasets,
num_workers,
prefixes,
split,
sizes_per_dataset,
megatron_datasets, num_workers, prefixes, split, sizes_per_dataset
)

torch.distributed.barrier()
@@ -389,11 +380,7 @@ def _threading_helper(
)
else:
_threading_helper(
megatron_datasets,
num_dataset_builder_threads,
prefixes,
split,
sizes_per_dataset,
megatron_datasets, num_dataset_builder_threads, prefixes, split, sizes_per_dataset
)

return megatron_datasets
8 changes: 3 additions & 5 deletions megatron/core/datasets/gpt_dataset.py
@@ -108,11 +108,9 @@ def __init__(
except Exception:
self._pad_token_id = _PAD_TOKEN_ID

(
self.document_index,
self.sample_index,
self.shuffle_index,
) = self._build_document_sample_shuffle_indices()
(self.document_index, self.sample_index, self.shuffle_index) = (
self._build_document_sample_shuffle_indices()
)

@staticmethod
def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int:
11 changes: 2 additions & 9 deletions megatron/core/datasets/indexed_dataset.py
@@ -385,12 +385,7 @@ def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndar
Returns:
numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`.
"""
return numpy.frombuffer(
self._bin_buffer,
dtype=dtype,
count=count,
offset=offset,
)
return numpy.frombuffer(self._bin_buffer, dtype=dtype, count=count, offset=offset)

def __del__(self) -> None:
"""Clean up the object."""
@@ -633,9 +628,7 @@ def __getitem__(
if isinstance(idx, (int, numpy.integer)):
sequence_pointer, sequence_length, sequence_mode = self.index[idx]
sequence = self.bin_reader.read(
dtype=self.index.dtype,
count=sequence_length,
offset=sequence_pointer,
dtype=self.index.dtype, count=sequence_length, offset=sequence_pointer
)
return (sequence, sequence_mode) if sequence_mode is not None else sequence
elif isinstance(idx, slice):
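As a concrete illustration of the read() behaviour documented above (count items of a given dtype, starting at a byte offset), here is a self-contained sketch against an in-memory buffer rather than the repository's data files:

import numpy

# Ten int32 values 0..9, serialized to 40 bytes.
buffer = numpy.arange(10, dtype=numpy.int32).tobytes()

# Read 4 int32 items starting at byte offset 8, i.e. elements 2..5.
window = numpy.frombuffer(buffer, dtype=numpy.int32, count=4, offset=8)
assert window.tolist() == [2, 3, 4, 5]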
10 changes: 1 addition & 9 deletions megatron/core/datasets/masked_dataset.py
@@ -154,15 +154,7 @@ def _build_sample_index(
)
path_to_description = get_path_to("description.txt")
path_to_sample_index = get_path_to("sample_index.npy")
cache_hit = all(
map(
os.path.isfile,
[
path_to_description,
path_to_sample_index,
],
)
)
cache_hit = all(map(os.path.isfile, [path_to_description, path_to_sample_index]))

if self.num_samples is not None:
num_epochs = numpy.iinfo(numpy.int32).max - 1
48 changes: 9 additions & 39 deletions megatron/core/datasets/retro/db/build.py
@@ -95,23 +95,13 @@ def build_partial_db(
if proc_id in progress_proc_ids:
log_retro_rank_0(
" > building partial chunk db, proc %d / %d, docs %d:%d / %d."
% (
proc_id,
n_procs,
doc_start_id,
doc_end_id,
n_docs,
)
% (proc_id, n_procs, doc_start_id, doc_end_id, n_docs)
)

# Progress bars (snapshot of overall progress).
doc_id_iter = range(doc_start_id, doc_end_id)
pbar = (
tqdm(
doc_id_iter,
"parse doc chunks",
miniters=len(doc_id_iter) // 20,
)
tqdm(doc_id_iter, "parse doc chunks", miniters=len(doc_id_iter) // 20)
if proc_id in progress_proc_ids
else doc_id_iter
)
@@ -156,9 +146,7 @@ def build_partial_db(
# Re-tokenize.
chunk_end_idx = chunk_end_idxs[i]
gpt_token_ids = indexed_dataset.get(
idx=doc_id,
offset=chunk_start_idx,
length=chunk_end_idx - chunk_start_idx,
idx=doc_id, offset=chunk_start_idx, length=chunk_end_idx - chunk_start_idx
)
text = config.gpt_detokenize(gpt_token_ids.tolist())
bert_token_ids = config.bert_tokenize(text)
@@ -169,14 +157,7 @@
else:
_chunk_db = chunk_db_valid
doc_size_map[doc_id] += 1
_chunk_db.append(
(
doc_id,
chunk_start_idx,
chunk_end_idx,
len(bert_token_ids),
)
)
_chunk_db.append((doc_id, chunk_start_idx, chunk_end_idx, len(bert_token_ids)))

return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map

@@ -269,10 +250,7 @@ def build_block_db(


def save_block_db(
block: dict,
chunk_db_valid: np.ndarray,
chunk_db_invalid: np.ndarray,
doc_offsets: np.ndarray,
block: dict, chunk_db_valid: np.ndarray, chunk_db_invalid: np.ndarray, doc_offsets: np.ndarray
) -> None:
"""Save block of chunked tokens to disk. These blocks are later used for
training and adding to the vector index.
@@ -291,10 +269,7 @@


def build_individual_db(
config: RetroPreprocessingConfig,
dataset_idx: int,
n_datasets: int,
dataset_info: dict,
config: RetroPreprocessingConfig, dataset_idx: int, n_datasets: int, dataset_info: dict
) -> None:
"""Process a single indexed dataset & extract chunks.
@@ -395,8 +370,7 @@ def build_individual_db(


def build_individual_dbs(
config: RetroPreprocessingConfig,
indexed_dataset_infos: List[Dict],
config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict]
) -> None:
"""Iterate each indexed dataset & process its chunks.
@@ -412,11 +386,7 @@
# Progress.
log_retro_rank_0(
" > building individual db, dataset %d / %d ... '%s'."
% (
ds_idx,
len(indexed_dataset_infos),
ds_info["prefix"],
)
% (ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
)

# Process single dataset.
@@ -562,7 +532,7 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)
for ds_idx, ds_info in enumerate(indexed_dataset_infos):
log_retro_rank_0(
" > merging dbs; '%s', dataset %d / %d ... '%s'."
% (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"]),
% (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"])
)
individual_chunk_db: np.ndarray = get_individual_chunk_db(project_dir, ds_idx, ds_info)
individual_doc_offsets: np.ndarray = (
7 changes: 2 additions & 5 deletions megatron/core/datasets/retro/db/dataset.py
@@ -17,7 +17,7 @@

class DBDataset(torch.utils.data.Dataset):
"""Dataset for iterating chunks.
Args:
db_path (str): Path of HDF5-format chunk database.
indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
@@ -85,10 +85,7 @@ def __getitem__(self, chunk_id: int) -> dict:
token_ids = token_ids.tolist()
token_ids += [self.eod_token_id] * (self.max_chunk_length - chunk_length)

return {
"doc_id": doc_id,
"text": np.array(token_ids, dtype=np.int64),
}
return {"doc_id": doc_id, "text": np.array(token_ids, dtype=np.int64)}

def load_doc_tuples(self) -> None:
"""Load the dataset & document ids.
6 changes: 2 additions & 4 deletions megatron/core/datasets/retro/db/utils.py
@@ -22,7 +22,7 @@ def get_db_dir(project_dir: str) -> str:
Args:
project_dir (str): Path to Retro project dir.
Returns:
Path of the DB sub-directory within the project.
"""
@@ -55,9 +55,7 @@ def init_indexed_dataset_infos(config: RetroPreprocessingConfig) -> List[Dict]:
prefix = data_blend[i + 1]
path = os.path.join(data_dir, prefix + ".bin")
assert os.path.exists(path), "couldn't find '%s'." % path
infos.append(
{"ratio": ratio, "prefix": prefix,}
)
infos.append({"ratio": ratio, "prefix": prefix})

# Load indexed datasets.
load_indexed_datasets(config.retro_project_dir, infos)
6 changes: 1 addition & 5 deletions megatron/core/datasets/retro/external_libs.py
@@ -4,11 +4,7 @@

import importlib

required_libs = [
"faiss",
"h5py",
"transformers", # for huggingface bert
]
required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert

for lib in required_libs:
try:
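The body of the import-check loop is collapsed in this view; a common shape for this kind of optional-dependency guard, given here only as a hedged sketch and not as the file's actual continuation, is:

import importlib

required_libs = ["faiss", "h5py", "transformers"]  # for huggingface bert

for lib in required_libs:
    try:
        importlib.import_module(lib)
    except ImportError as exc:
        raise ImportError(
            f"Retro preprocessing requires the '{lib}' package; please install it first."
        ) from exc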
2 changes: 1 addition & 1 deletion megatron/core/datasets/retro/index/build.py
@@ -41,7 +41,7 @@ def get_empty_index_path(config: RetroPreprocessingConfig) -> str:
Args:
config (RetroPreprocessingConfig): Retro preprocessing config.
Returns:
Path to the empty (trained, but without added samples) vector index.
"""
2 changes: 1 addition & 1 deletion megatron/core/datasets/retro/index/factory.py
@@ -23,7 +23,7 @@ def get_index_class(cls, index_type: str) -> type:
Returns:
An `Index` sub-type corresponding to the `index_type`.
"""
return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex,}[index_type]
return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex}[index_type]

@classmethod
def get_index(cls, index_type: str) -> Index: