[PyTorch][Training][SageMaker] PyTorch 2.5.1 Currency Release (#4423)

Yadan-Wei · Yadan Wei · web-flow · commit f3f70fa97e3c · 2024-11-18T12:02:19.000-08:00
* add sm build spec

* fix allowlist

* add more vuln in allowlist

* pin cloudpickle

* run all tests

* revert toml

* fix toml test name

* remove blank line

---------

Co-authored-by: Yadan Wei <yadanwei@amazon.com>
diff --git a/pytorch/training/buildspec-2-5-sm.yml b/pytorch/training/buildspec-2-5-sm.yml
@@ -0,0 +1,66 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+framework: &FRAMEWORK pytorch
+version: &VERSION 2.5.1
+short_version: &SHORT_VERSION "2.5"
+arch_type: x86
+# autopatch_build: "True"
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  training_context: &TRAINING_CONTEXT
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+
+images:
+  BuildSageMakerCPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_CPU_TRAINING_PY3 false
+    image_size_baseline: 6200
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py311
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # build_tag_override: "beta:2.5.1-cpu-py311-ubuntu22.04-sagemaker"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildSageMakerGPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 21500
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py311
+    cuda_version: &CUDA_VERSION cu124
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # build_tag_override: "beta:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+                         *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-2-5-ec2.yml
+buildspec_pointer: buildspec-2-5-sm.yml
diff --git a/pytorch/training/docker/2.5/py3/Dockerfile.cpu b/pytorch/training/docker/2.5/py3/Dockerfile.cpu
@@ -310,7 +310,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     scikit-learn \
     seaborn \
     shap \
-    # pinned for sagemaker==2.232.2
+    # pinned for sagemaker==2.233.0
     "cloudpickle==2.2.1" \
  && /opt/conda/bin/mamba clean -afy
 
diff --git a/pytorch/training/docker/2.5/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json b/pytorch/training/docker/2.5/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json
@@ -0,0 +1,11 @@
+{
+  "70612": "[pkg: jinja2] In Jinja2, the from_string function is prone to Server Side Template Injection (SSTI) where it takes the source parameter as a template object, renders it, and then returns it. The attacker can exploit it with INJECTION COMMANDS in a URI. \r\nNOTE: The maintainer and multiple third parties believe that this vulnerability isn't valid because users shouldn't use untrusted templates without sandboxing, reason_to_ignore='N/A', spec='>=0'",
+  "71584": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform affected versions, enabling a maliciously uploaded LightGBM scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.23.0'",
+  "71693": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded pmdarima model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.24.0'",
+  "71692": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded Tensorflow model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=2.0.0rc0'",
+  "71587": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded PyFunc model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=0.9.0'",
+  "71589": "[pkg: mlflow] Required by sagemaker. advisory='A path traversal vulnerability exists in mlflow/mlflow version 2.9.2, allowing attackers to access arbitrary files on the server. By crafting a series of HTTP POST requests with specially crafted 'artifact_location' and 'source' parameters, using a local URI with '#' instead of '?', an attacker can traverse the server's directory structure. The issue occurs due to insufficient validation of user-supplied input in the server's handlers.', reason_to_ignore='N/A', spec='>=2.9.2'",
+  "71577": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'",
+  "71578": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'",
+  "71579": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user’s system when run.', reason_to_ignore='N/A', spec='>=1.27.0'"
+}
diff --git a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.gpu
@@ -459,7 +459,8 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     shap \
     scikit-learn \
     seaborn \
-    cloudpickle \
+    # pinned for sagemaker==2.233.0
+    "cloudpickle==2.2.1" \
  && /opt/conda/bin/mamba clean -afy
 
 # Copy workaround script for incorrect hostname
diff --git a/pytorch/training/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.py_scan_allowlist.json b/pytorch/training/docker/2.5/py3/cu124/Dockerfile.sagemaker.gpu.py_scan_allowlist.json
@@ -0,0 +1,11 @@
+{
+  "70612": "[pkg: jinja2] In Jinja2, the from_string function is prone to Server Side Template Injection (SSTI) where it takes the source parameter as a template object, renders it, and then returns it. The attacker can exploit it with INJECTION COMMANDS in a URI. \r\nNOTE: The maintainer and multiple third parties believe that this vulnerability isn't valid because users shouldn't use untrusted templates without sandboxing, reason_to_ignore='N/A', spec='>=0'",
+  "71584": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform affected versions, enabling a maliciously uploaded LightGBM scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.23.0'",
+  "71693": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded pmdarima model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.24.0'",
+  "71692": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded Tensorflow model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=2.0.0rc0'",
+  "71587": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in affected versions of the MLflow platform, enabling a maliciously uploaded PyFunc model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=0.9.0'",
+  "71589": "[pkg: mlflow] Required by sagemaker. advisory='A path traversal vulnerability exists in mlflow/mlflow version 2.9.2, allowing attackers to access arbitrary files on the server. By crafting a series of HTTP POST requests with specially crafted 'artifact_location' and 'source' parameters, using a local URI with '#' instead of '?', an attacker can traverse the server's directory structure. The issue occurs due to insufficient validation of user-supplied input in the server's handlers.', reason_to_ignore='N/A', spec='>=2.9.2'",
+  "71577": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'",
+  "71578": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.1.0 or newer, enabling a maliciously uploaded scikit-learn model to run arbitrary code on an end user’s system when interacted with.', reason_to_ignore='N/A', spec='>=1.1.0'",
+  "71579": "[pkg: mlflow] Required by sagemaker. advisory='Deserialization of untrusted data can occur in versions of the MLflow platform running version 1.27.0 or newer, enabling a maliciously crafted Recipe to execute arbitrary code on an end user’s system when run.', reason_to_ignore='N/A', spec='>=1.27.0'"
+}

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-buildspec_pointer: buildspec-2-5-ec2.yml`
	`1`	`+buildspec_pointer: buildspec-2-5-sm.yml`