Possible changes for biobert config changes #655

Draft: wants to merge 6 commits into main
6 changes: 5 additions & 1 deletion .vscode/settings.json
@@ -26,5 +26,9 @@
"editor.rulers": [
120
],
"autoDocstring.docstringFormat": "google-notypes"
"autoDocstring.docstringFormat": "google-notypes",
"launch": {
"configurations": [],
"compounds": []
}
}
30 changes: 13 additions & 17 deletions Dockerfile
@@ -16,12 +16,6 @@ RUN rustup set profile minimal && \

FROM ${BASE_IMAGE} AS bionemo2-base

# Install NeMo dependencies.
WORKDIR /build

ARG MAX_JOBS=4
ENV MAX_JOBS=${MAX_JOBS}

# Install core apt packages.
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
@@ -34,13 +28,17 @@ apt-get install -qyy \
git \
curl \
pre-commit \
sudo
sudo \
gnupg
apt-get upgrade -qyy \
rsync
rm -rf /tmp/* /var/tmp/*
EOF

RUN apt-get install -y gnupg
ARG TE_COMMIT=2215fa5c7557b66034068816020f9f611019e457
RUN NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi \
pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/NVIDIA/TransformerEngine.git@${TE_COMMIT}

# Check the nemo dependency for causal conv1d and make sure this checkout
# tag matches. If not, update the tag in the following line.
@@ -57,9 +55,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_T

RUN mkdir -p /workspace/bionemo2/

# Delete the temporary /build directory.
WORKDIR /workspace
RUN rm -rf /build

# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
@@ -79,7 +75,7 @@ ENV UV_LINK_MODE=copy \
# Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
--mount=type=cache,target=/root/.cache \
--mount=type=cache,target=/root/.cache \
uv pip install --no-build-isolation -r /requirements-pyg.txt

COPY --from=rust-env /usr/local/cargo /usr/local/cargo
@@ -99,9 +95,9 @@ COPY ./sub-packages /workspace/bionemo2/sub-packages
# Includes a hack to install tensorstore 0.1.45, which doesn't distribute a pypi wheel for python 3.12, and the metadata
# in the source distribution doesn't match the expected pypi version.
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
--mount=type=cache,target=/root/.cache <<EOF
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
--mount=type=cache,target=/root/.cache <<EOF
set -eo pipefail

uv pip install maturin --no-build-isolation
@@ -124,8 +120,8 @@ EOF
FROM ${BASE_IMAGE} AS dev

RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
<<EOF
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
<<EOF
set -eo pipefail
apt-get update -qy
apt-get install -qyy \
@@ -168,7 +164,7 @@ ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \
--mount=type=cache,target=/root/.cache <<EOF
--mount=type=cache,target=/root/.cache <<EOF
set -eo pipefail
uv pip install -r /workspace/bionemo2/requirements-dev.txt
rm -rf /tmp/*
@@ -330,6 +330,7 @@ class ESM2GenericConfig(BioBertConfig[ESM2ModelT, MegatronLossType]):
include_input_ids: bool = False
skip_logits: bool = False
return_only_hidden_states: bool = False # return logits
variable_seq_lengths: bool = False

def __post_init__(self):
# TODO, as a validator?
159 changes: 84 additions & 75 deletions sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py
@@ -129,37 +129,15 @@ class BioBertOutput(BioBertOutputCore, total=False):

# TODO make this a base class without the language head and pooler
class MegatronBioBertModel(LanguageModule):
"""Transformer language model.

Args:
config: transformer config
num_tokentypes: Set to 2 when args.bert_binary_head is True, and 0 otherwise. Defaults to 0.
transformer_layer_spec: Specifies module to use for transformer layers
vocab_size: vocabulary size
max_sequence_length: maximum size of sequence. This is used for positional embedding
pre_process: Include embedding layer (used with pipeline parallelism)
post_process: Include an output layer (used with pipeline parallelism)
parallel_output: Do not gather the outputs, keep them split across tensor parallel ranks
share_embeddings_and_output_weights: When True, input embeddings and output logit weights are shared.
Defaults to False.
position_embedding_type: Position embedding type. Options ["learned_absolute", "rope"].
Default is 'learned_absolute'.
rotary_percent: Percent of rotary dimension to use for rotary position embeddings.
Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
"""
"""Transformer language model."""

def __init__( # noqa: D107
def __init__(
self,
config: TransformerConfig,
config: "BioBertConfig",
num_tokentypes: int,
transformer_layer_spec: ModuleSpec,
vocab_size: int,
max_sequence_length: int,
Comment on lines -155 to -157 (Collaborator, Author):
Nearly all these arguments that we're passing around are also in the config object, which we've mis-typed as a vanilla TransformerConfig
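
For illustration, here is a minimal, dependency-free sketch of the direction this takes: the model reads everything off one composed config object instead of accepting a pile of duplicated constructor arguments. Class and field names below are stand-ins, not the real Megatron/BioNeMo classes.

```python
from dataclasses import dataclass, field


@dataclass
class TransformerSettings:  # stand-in for megatron's TransformerConfig
    hidden_size: int = 512
    num_attention_heads: int = 8
    num_layers: int = 6


@dataclass
class ComposedBioBertConfig:  # stand-in for BioBertConfig after this change
    transformer_config: TransformerSettings = field(default_factory=TransformerSettings)
    seq_length: int = 1024
    vocab_size: int = 32_000


class SketchModel:  # stand-in for MegatronBioBertModel
    def __init__(self, config: ComposedBioBertConfig, num_tokentypes: int = 0):
        # Everything that used to be a separate constructor argument is read off the config.
        self.config = config
        self.max_sequence_length = config.seq_length
        self.vocab_size = config.vocab_size
        self.hidden_size = config.transformer_config.hidden_size
        self.num_tokentypes = num_tokentypes


model = SketchModel(ComposedBioBertConfig(), num_tokentypes=2)
print(model.max_sequence_length, model.vocab_size, model.hidden_size)
```

In the actual diff, `config.transformer_config` carries the Megatron `TransformerConfig`, while `seq_length` and `vocab_size` live on `BioBertConfig` itself.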

tokenizer: Optional[AutoTokenizer] = None,
pre_process: bool = True,
post_process: bool = True,
fp16_lm_cross_entropy: bool = False,
parallel_output: bool = True,
share_embeddings_and_output_weights: bool = False,
position_embedding_type: PositionEmbeddingKinds = "learned_absolute",
rotary_percent: float = 1.0,
@@ -172,29 +150,45 @@ def __init__( # noqa: D107
include_input_ids: bool = False,
skip_logits: bool = False, # Useful for inference time.
):
"""Initialize the MegatronBioBertModel.

Args:
config (TransformerConfig): transformer config
num_tokentypes (int): Set to 2 when args.bert_binary_head is True, and 0 otherwise. Defaults to 0.
tokenizer (AutoTokenizer): optional tokenizer object (currently only used in the constructor of ESM2Model)
pre_process (bool): Include embedding layer (used with pipeline parallelism)
post_process (bool): Include an output layer (used with pipeline parallelism)
share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False.
position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope'].
Default is 'learned_absolute'.
rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
seq_len_interpolation_factor (Optional[float]): Interpolation factor for sequence length. Defaults to None.
add_binary_head (bool): Whether to add a binary head. Defaults to True.
return_embeddings (bool): Whether to return embeddings. Defaults to False.
include_embeddings (bool): Whether to include embeddings in the output dictionary. Defaults to False.
include_input_ids (bool): Whether to include input_ids in the output dictionary. Defaults to False.
use_full_attention_mask (bool): Whether to use full attention mask. Defaults to False.
include_hiddens (bool): Whether to include hidden states in the output dictionary. Defaults to False.
skip_logits (bool): Skip writing the token logits in output dict
"""
# TODO (@jstjohn) come up with a cleaner way for this model to return a set of things the user wants.
# hidden states, embeddings, logits, etc. The defaults should work for training but we need to make it
# customizable and easy to tell how to make it work well for inference as well as trouble shooting.
# Also make sure that everything returned that the user wants gets transposed to the b,s,h format.
super(MegatronBioBertModel, self).__init__(config=config)
self.post_process = post_process
super(MegatronBioBertModel, self).__init__(config=config.transformer_config)
self.add_binary_head = add_binary_head
self.skip_logits = skip_logits
if return_embeddings:
assert self.post_process, "only return embeddings on the last pipeline stage"
assert post_process, "only return embeddings on the last pipeline stage"
# `b` = batch, `s` = sequence.
# The old flash attention mechanism apparently wants you to use a b x 1 x s x s attention mask while
# the new one wants a b x 1 x 1 x s attention mask. This is a hack to allow us to switch between the two.
self.use_full_attention_mask = use_full_attention_mask
self.config: TransformerConfig = config
self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
self.vocab_size = vocab_size
self.max_sequence_length = max_sequence_length
self.config: "BioBertConfig" = config
self.tokenizer = tokenizer
self.pre_process = pre_process
self.post_process = post_process
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
self.parallel_output = parallel_output
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
self.position_embedding_type = position_embedding_type
self.add_binary_head = add_binary_head
@@ -210,30 +204,30 @@ def __init__( # noqa: D107
if self.pre_process:
self.register_buffer(
"bert_position_id_tensor",
torch.arange(max_sequence_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
torch.arange(self.config.seq_length, dtype=torch.long, requires_grad=False).unsqueeze(0),
persistent=False,
)
self.embedding = LanguageModelEmbedding(
config=self.config,
vocab_size=self.vocab_size,
max_sequence_length=self.max_sequence_length,
config=config.transformer_config,
vocab_size=self.config.vocab_size,
max_sequence_length=self.config.seq_length,
position_embedding_type=position_embedding_type,
num_tokentypes=num_tokentypes,
)

if self.position_embedding_type == "rope":
self.rotary_pos_emb = RotaryEmbedding(
kv_channels=self.config.kv_channels,
kv_channels=config.transformer_config.kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=self.config.rotary_interleaved,
rotary_interleaved=config.transformer_config.rotary_interleaved,
# bug in megatron: they list the type as `float` but they default to `None` so it should be `Optional[float]`
seq_len_interpolation_factor=seq_len_interpolation_factor, # type: ignore
)

# Transformer.
self.encoder = TransformerBlock(
config=self.config,
spec=self.transformer_layer_spec,
config=config.transformer_config,
spec=self.config.transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process, # NOTE: in bionemo1 this is hard-coded to True
)
@@ -263,13 +257,13 @@ def __init__( # noqa: D107

self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
self.config.vocab_size,
config=config,
init_method=config.init_method,
is_expert=False,
bias=True,
skip_bias_add=False,
gather_output=not self.parallel_output,
gather_output=not self.config.parallel_output,
skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
@@ -324,7 +318,7 @@ def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
def bert_position_ids(self, token_ids): # noqa: D102
# Create position ids
seq_length = token_ids.size(1)
if seq_length != self.max_sequence_length:
if seq_length != self.config.seq_length:
return self.bert_position_id_tensor[:, :seq_length]
return self.bert_position_id_tensor # No need to subset so skip the slice op

@@ -483,27 +477,28 @@ class BioBertConfig(
`configure_model()` is ultimately called by the LightningModule using PTL lightning module hooks.
"""

# From megatron.core.models.gpt.bert_model.GPTModel
kv_channels: int | None = None
fp16_lm_cross_entropy: bool = False
apply_rope_fusion: bool = True
transformer_config: TransformerConfig = field(
default_factory=lambda: TransformerConfig(
apply_rope_fusion=True,
bias_dropout_fusion=True,
bias_activation_fusion=True,
masked_softmax_fusion=True,
persist_layer_norm=True,
hidden_size=512,
num_attention_heads=8,
num_layers=6,
init_method_std=0.02,
)
)
parallel_output: bool = True
bias_dropout_fusion: bool = True
bias_activation_fusion: bool = True
masked_softmax_fusion: bool = True
persist_layer_norm: bool = True
get_attention_mask_from_fusion: bool = True
share_embeddings_and_output_weights: bool = False # try True

make_vocab_size_divisible_by: int = 128
position_embedding_type: PositionEmbeddingKinds = "learned_absolute"
rotary_base: int = 10000
rotary_base: int = 10_000
rotary_percent: float = 1.0
seq_len_interpolation_factor: Optional[float] = None
seq_length: int = 1024
hidden_size: int = 512
num_attention_heads: int = 8
num_layers: int = 6
init_method_std: float = 0.02

biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec

optimizer_fn: Optional[Callable[["MegatronBioBertModel"], Optimizer]] = None
@@ -531,13 +526,26 @@ class BioBertConfig(
# loss reduction class
loss_reduction_class: Type[MegatronLossType] = BERTMLMLossWithReduction

transformer_layer_spec: ModuleSpec = field(init=False)
vocab_size: int = field(init=False)

def __post_init__(self) -> None:
"""Initialize derived config fields."""
# Don't raise an AttributeError if the parent doesn't define a __post_init__
getattr(super(), "__post_init__", lambda: None)()

self.transformer_layer_spec = get_biobert_spec(
self.biobert_spec_option,
qk_layernorm=self.transformer_config.qk_layernorm,
core_attention=self.core_attention_override,
)

def configure_model(self, tokenizer: AutoTokenizer) -> MegatronBioBertModelType: # noqa: D102
vp_size = self.virtual_pipeline_model_parallel_size
if vp_size:
p_size = self.pipeline_model_parallel_size
assert (
self.num_layers // p_size
) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."
self.vocab_size = get_vocab_size(
self.transformer_config, tokenizer.vocab_size, self.make_vocab_size_divisible_by
)

_assert_equal_pipeline_parallel_chunks(self.transformer_config)

# The local specs all require the standard full attention mask.
use_full_attention_mask: bool = "transformer_engine" not in self.biobert_spec_option
@@ -552,17 +560,8 @@ def configure_model(self, tokenizer: AutoTokenizer) -> MegatronBioBertModelType:

model = self.model_cls(
self,
transformer_layer_spec=get_biobert_spec(
self.biobert_spec_option,
qk_layernorm=self.qk_layernorm,
core_attention=self.core_attention_override,
),
num_tokentypes=2 if do_next_sentence else 0,
vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by),
max_sequence_length=self.seq_length,
tokenizer=tokenizer,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
share_embeddings_and_output_weights=self.share_embeddings_and_output_weights,
position_embedding_type=self.position_embedding_type,
rotary_percent=self.rotary_percent,
@@ -616,3 +615,13 @@ def configure_model(self, tokenizer: AutoTokenizer) -> MegatronBioBertModelType:
def get_loss_reduction_class(self) -> Type[MegatronLossType]: # noqa: D102
# You could optionally return a different loss reduction class here based on the config settings.
return self.loss_reduction_class


def _assert_equal_pipeline_parallel_chunks(config: TransformerConfig) -> None:
"""Ensure the number of model chunks is the same across all pipeline stages."""
vp_size = config.virtual_pipeline_model_parallel_size
if vp_size:
p_size = config.pipeline_model_parallel_size
assert (
config.num_layers // p_size
) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."
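
The helper factored out above formalizes a simple divisibility constraint between layer count, pipeline stages, and the virtual pipeline size. A small standalone sketch with concrete, purely illustrative numbers shows what passes and what fails:

```python
def chunks_divide_evenly(num_layers: int, pipeline_size: int, virtual_pipeline_size: int) -> bool:
    """Mirror of the assertion in _assert_equal_pipeline_parallel_chunks, as a boolean check."""
    layers_per_stage = num_layers // pipeline_size
    return layers_per_stage % virtual_pipeline_size == 0


# 24 layers over 4 pipeline stages -> 6 layers per stage.
print(chunks_divide_evenly(24, 4, 2))  # True: each stage splits into 2 chunks of 3 layers
print(chunks_divide_evenly(24, 4, 4))  # False: 6 layers per stage cannot form 4 equal chunks
```
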
3 changes: 1 addition & 2 deletions sub-packages/bionemo-llm/src/bionemo/llm/model/config.py
@@ -19,7 +19,6 @@
from pathlib import Path
from typing import Any, Generic, List, Protocol, Sequence, Type

from megatron.core.transformer import TransformerConfig
from nemo.lightning import io

from bionemo.core.model.config import BionemoModelConfig, BionemoTrainableModelConfig
@@ -52,7 +51,7 @@
OVERRIDE_BIONEMO_CONFIG_DEFAULTS = deepcopy(_OVERRIDE_BIONEMO_CONFIG_DEFAULTS) # copy for export


class MegatronBioNeMoModelConfig(BionemoModelConfig[MegatronModelType], TransformerConfig, iom.WillHaveGetSetHparam):
class MegatronBioNeMoModelConfig(BionemoModelConfig[MegatronModelType], iom.WillHaveGetSetHparam):
"""A ModelConfig class for bionemo that supports usage with Megatron models, for example as NeMo2 requires."""

model_cls: Type[MegatronModelType]
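
One more pattern worth flagging from the BioBertConfig changes above: `transformer_layer_spec` and `vocab_size` become `field(init=False)` attributes that are derived after construction (in `__post_init__` and in `configure_model()` respectively) rather than passed in, and the parent's `__post_init__` is invoked only if it exists. A minimal, dependency-free sketch of that dataclass pattern, with stand-in names rather than the real classes:

```python
from dataclasses import dataclass, field
from enum import Enum


class SpecOption(Enum):  # stand-in for BiobertSpecOption
    transformer_engine = "transformer_engine"
    local = "local"


def get_layer_spec(option: SpecOption) -> str:
    # Stand-in for get_biobert_spec(); returns a placeholder "spec".
    return f"layer-spec-for-{option.value}"


@dataclass
class DerivedFieldConfig:
    spec_option: SpecOption = SpecOption.transformer_engine
    # Not a constructor argument: always derived in __post_init__.
    layer_spec: str = field(init=False)

    def __post_init__(self) -> None:
        # Call a parent __post_init__ only if one exists, as the diff does,
        # so the class composes safely with bases that may not define one.
        getattr(super(), "__post_init__", lambda: None)()
        self.layer_spec = get_layer_spec(self.spec_option)


cfg = DerivedFieldConfig(spec_option=SpecOption.local)
print(cfg.layer_spec)  # -> layer-spec-for-local
```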