Skip to content

Commit 343090c

Browse files
committed
Hardcode number of batches
Signed-off-by: sailesh duddupudi <[email protected]>
1 parent c80db23 commit 343090c

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

examples/jax/jax-dist-spmd-mnist/spmd_mnist_classifier_fromscratch.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,9 @@ def accuracy(params, batch):
104104
train_images, train_labels, test_images, test_labels = datasets.mnist()
105105
num_train = train_images.shape[0]
106106
num_complete_batches, leftover = divmod(num_train, batch_size)
107-
num_batches = num_complete_batches + bool(leftover)
107+
108+
# Increasing number of batches requires more resources.
109+
num_batches = 10
108110

109111
def data_stream():
110112
rng = npr.RandomState(0)
@@ -155,7 +157,6 @@ def replicate_array(x):
155157

156158
for epoch in range(num_epochs):
157159
start_time = time.time()
158-
num_batches = 5
159160
for _ in range(num_batches):
160161
replicated_params = spmd_update(replicated_params, next(batches))
161162
epoch_time = time.time() - start_time

sdk/python/test/e2e/test_e2e_jaxjob.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace):
8888
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))
8989

9090
try:
91-
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=9000)
91+
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
9292
except Exception as e:
9393
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
9494
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
@@ -123,7 +123,7 @@ def test_sdk_e2e(job_namespace):
123123
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))
124124

125125
try:
126-
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=9000)
126+
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
127127
except Exception as e:
128128
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
129129
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
@@ -156,5 +156,5 @@ def generate_container() -> V1Container:
156156
return V1Container(
157157
name=CONTAINER_NAME,
158158
image=os.getenv("JAX_JOB_IMAGE", "docker.io/kubeflow/jaxjob-dist-spmd-mnist:latest"),
159-
# resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
159+
resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
160160
)

0 commit comments

Comments
 (0)