diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 2ddfe8ccb932..4a0ba78229f8 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -37,7 +37,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["sglang"]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
diff --git a/sglang/build_artifacts/dockerd_entrypoint.sh b/sglang/build_artifacts/dockerd_entrypoint.sh
new file mode 100644
index 000000000000..4368672a9c4c
--- /dev/null
+++ b/sglang/build_artifacts/dockerd_entrypoint.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Run the telemetry script if it exists, suppressing output and errors
+# so that container startup never fails because of telemetry.
+bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
+
+python3 -m sglang.launch_server "$@"
\ No newline at end of file
diff --git a/sglang/buildspec-ec2.yml b/sglang/buildspec-ec2.yml
new file mode 100644
index 000000000000..8ebd9fb87575
--- /dev/null
+++ b/sglang/buildspec-ec2.yml
@@ -0,0 +1,56 @@
+account_id: &ACCOUNT_ID
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION
+framework: &FRAMEWORK sglang
+version: &VERSION "0.5.6"
+short_version: &SHORT_VERSION "0.5"
+arch_type: &ARCH_TYPE x86_64
+autopatch_build: "False"
+
+repository_info:
+  build_repository: &BUILD_REPOSITORY
+    image_type: &IMAGE_TYPE gpu
+    root: .
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  build_context: &BUILD_CONTEXT
+    deep_learning_container:
+      source: src/deep_learning_container.py
+      target: deep_learning_container.py
+    install_efa:
+      source: scripts/install_efa.sh
+      target: install_efa.sh
+    start_cuda_compat:
+      source: sglang/build_artifacts/start_cuda_compat.sh
+      target: start_cuda_compat.sh
+    sagemaker_entrypoint:
+      source: sglang/build_artifacts/dockerd_entrypoint.sh
+      target: dockerd_entrypoint.sh
+
+images:
+  sglang_ec2:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 26000
+    device_type: &DEVICE_TYPE gpu
+    cuda_version: &CUDA_VERSION cu129
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    skip_build: "False"
+    docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /Dockerfile ]
+    target: sglang-ec2
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - ec2
diff --git a/sglang/buildspec.yml b/sglang/buildspec.yml
index f91a3b188954..cea0302f53a8 100644
--- a/sglang/buildspec.yml
+++ b/sglang/buildspec.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-sm.yml
+buildspec_pointer: buildspec-ec2.yml
diff --git a/sglang/x86_64/gpu/Dockerfile b/sglang/x86_64/gpu/Dockerfile
index 5e98d6abbd8e..49e4c203bb89 100644
--- a/sglang/x86_64/gpu/Dockerfile
+++ b/sglang/x86_64/gpu/Dockerfile
@@ -83,6 +83,24 @@ RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc \
     && rm -rf /var/lib/apt/lists/* \
     && rm -rf /root/.cache | true
 
+# =======================================================
+# ====================== EC2 ============================
+# =======================================================
+
+FROM base AS sglang-ec2
+
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold \
+    && apt-get update \
+    && apt-get upgrade -y \
+    && apt-get clean
+
+RUN rm -rf /tmp/*
+
+COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
+
 # =======================================================
 # ====================== sagemaker ======================
 # =======================================================
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
index 7e7522995fca..9f54a4995d56 100644
--- a/test/dlc_tests/conftest.py
+++ b/test/dlc_tests/conftest.py
@@ -109,6 +109,8 @@
     "pytorch_trcomp_training",
     # Autogluon
     "autogluon_training",
+    # SGLang
+    "sglang",
     # Processor fixtures
     "gpu",
     "cpu",
diff --git a/test/dlc_tests/ec2/sglang/__init__.py b/test/dlc_tests/ec2/sglang/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/dlc_tests/ec2/sglang/ec2_tests/__init__.py b/test/dlc_tests/ec2/sglang/ec2_tests/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/dlc_tests/ec2/sglang/ec2_tests/test_sglang_ec2.py b/test/dlc_tests/ec2/sglang/ec2_tests/test_sglang_ec2.py
new file mode 100644
index 000000000000..d30540908e85
--- /dev/null
+++ b/test/dlc_tests/ec2/sglang/ec2_tests/test_sglang_ec2.py
@@ -0,0 +1,310 @@
+"""
+SGLang EC2 Tests
+Tests SGLang inference capabilities on EC2 instances
+"""
+
+import json
+import logging
+import os
+import sys
+import time
+
+import boto3
+import pytest
+from botocore.exceptions import ClientError
+
+from test.test_utils import get_account_id_from_image_uri, get_framework_and_version_from_tag
+from test.test_utils.ec2 import login_to_ecr_registry
+
+# Setup logging
+LOGGER = logging.getLogger(__name__)
+LOGGER.addHandler(logging.StreamHandler(sys.stdout))
+LOGGER.setLevel(logging.INFO)
+
+# Test constants
+SGLANG_EC2_GPU_INSTANCE_TYPE = "g5.2xlarge"
+SGLANG_EC2_LARGE_GPU_INSTANCE_TYPE = "g6e.xlarge"
+DATASET_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+DEFAULT_REGION = "us-west-2"
+
+
+def get_hf_token_from_secrets_manager():
+    """
+    Retrieve HuggingFace token from AWS Secrets Manager
+
+    Returns:
+        str: HuggingFace token or empty string if not found
+    """
+    secret_name = "test/hf_token"
+    region_name = "us-west-2"
+
+    try:
+        session = boto3.session.Session()
+        client = session.client(service_name="secretsmanager", region_name=region_name)
+        get_secret_value_response = client.get_secret_value(SecretId=secret_name)
+        response = json.loads(get_secret_value_response["SecretString"])
+        hf_token = response.get("HF_TOKEN", "")
+        return hf_token
+    except ClientError as e:
+        LOGGER.warning(f"Failed to retrieve HF_TOKEN from Secrets Manager: {e}")
+        return ""
+    except Exception as e:
+        LOGGER.warning(f"Unexpected error retrieving HF_TOKEN: {e}")
+        return ""
+
+
+def setup_docker_image(ec2_connection, image_uri):
+    """
+    Pull SGLang Docker image from ECR
+
+    Args:
+        ec2_connection: Fabric connection to EC2 instance
+        image_uri: Docker image URI
+    """
+    account_id = get_account_id_from_image_uri(image_uri)
+    login_to_ecr_registry(ec2_connection, account_id, DEFAULT_REGION)
+    LOGGER.info(f"Pulling SGLang image: {image_uri}")
+    ec2_connection.run(f"docker pull {image_uri}", hide="out")
+
+
+def setup_dataset(ec2_connection):
+    """
+    Download ShareGPT dataset for benchmarking
+
+    Args:
+        ec2_connection: Fabric connection to EC2 instance
+    """
+    ec2_connection.run("mkdir -p /tmp/dataset")
+
+    dataset_check = ec2_connection.run(
+        "test -f /tmp/dataset/ShareGPT_V3_unfiltered_cleaned_split.json && echo 'exists' || echo 'missing'",
+        hide=True,
+    )
+
+    if "missing" in dataset_check.stdout:
+        LOGGER.info("Downloading ShareGPT dataset...")
+        ec2_connection.run(f"wget -P /tmp/dataset {DATASET_URL}")
+    else:
+        LOGGER.info("ShareGPT dataset already exists, skipping download")
+
+
+def cleanup_containers(ec2_connection):
+    """
+    Clean up all Docker containers
+
+    Args:
+        ec2_connection: Fabric connection to EC2 instance
+    """
+    try:
+        LOGGER.info("Cleaning up Docker containers...")
+        commands = [
+            "docker ps -aq | xargs -r docker stop",
+            "docker ps -aq | xargs -r docker rm",
+        ]
+        for cmd in commands:
+            ec2_connection.run(cmd, hide=True, warn=True)
+    except Exception as e:
+        LOGGER.warning(f"Cleanup warning: {e}")
+
+
+@pytest.mark.model("Qwen3-0.6B")
+@pytest.mark.processor("gpu")
+@pytest.mark.parametrize("ec2_instance_type", [SGLANG_EC2_GPU_INSTANCE_TYPE], indirect=True)
+def test_sglang_ec2_local_benchmark(ec2_connection, sglang):
+    """
+    Test SGLang local benchmark on EC2 using ShareGPT dataset
+
+    This test validates:
+    - SGLang server startup with Qwen3-0.6B model
+    - Benchmark execution with 1000 prompts
+    - Basic inference capabilities
+
+    Args:
+        ec2_connection: Fabric connection to EC2 instance
+        sglang: SGLang Docker image URI
+    """
+    try:
+        LOGGER.info("\n" + "=" * 80)
+        LOGGER.info("Starting SGLang EC2 Local Benchmark Test")
+        LOGGER.info("=" * 80 + "\n")
+
+        # Setup
+        setup_docker_image(ec2_connection, sglang)
+        setup_dataset(ec2_connection)
+
+        # Get HuggingFace token
+        hf_token = os.environ.get("HF_TOKEN", "")
+
+        # Start SGLang container
+        container_name = "sglang_benchmark"
+        container_cmd = f"""
+            docker run -d --name {container_name} --rm --gpus=all \
+            -v /home/ubuntu/.cache/huggingface:/root/.cache/huggingface \
+            -v /tmp/dataset:/dataset \
+            -p 30000:30000 \
+            -e HF_TOKEN={hf_token} \
+            {sglang} \
+            --model-path Qwen/Qwen3-0.6B \
+            --reasoning-parser qwen3 \
+            --host 0.0.0.0 \
+            --port 30000
+        """
+
+        LOGGER.info("Starting SGLang server container...")
+        ec2_connection.run(container_cmd)
+
+        # Wait for server startup
+        LOGGER.info("Waiting for server startup (120s)...")
+        time.sleep(120)
+
+        # Check container logs
+        LOGGER.info("Container logs:")
+        ec2_connection.run(f"docker logs {container_name}")
+
+        # Run benchmark
+        LOGGER.info("Running SGLang benchmark...")
+        benchmark_cmd = f"""
+            docker exec {container_name} python3 -m sglang.bench_serving \
+            --backend sglang \
+            --host 0.0.0.0 --port 30000 \
+            --num-prompts 1000 \
+            --model Qwen/Qwen3-0.6B \
+            --dataset-name sharegpt \
+            --dataset-path /dataset/ShareGPT_V3_unfiltered_cleaned_split.json
+        """
+
+        # warn=True so a nonzero exit reaches the return-code check below
+        result = ec2_connection.run(benchmark_cmd, warn=True)
+
+        if result.return_code == 0:
+            LOGGER.info("\n✓ SGLang local benchmark test passed successfully")
+        else:
+            LOGGER.error(f"\n✗ Benchmark test failed with return code {result.return_code}")
+            raise AssertionError(f"Benchmark test failed with return code {result.return_code}")
+
+    finally:
+        cleanup_containers(ec2_connection)
+
+
+@pytest.mark.model("Llama-3.1-8B")
+@pytest.mark.processor("gpu")
+@pytest.mark.parametrize("ec2_instance_type", [SGLANG_EC2_LARGE_GPU_INSTANCE_TYPE], indirect=True)
+def test_sglang_ec2_upstream(ec2_connection, sglang):
+    """
+    Test SGLang upstream test suite on EC2
+
+    This test validates:
+    - Compatibility with SGLang upstream test suite (stage-a-test-1)
+    - Support for gated models (Llama-3.1-8B)
+    - Test execution in containerized environment
+
+    Note: Uses g6e.xlarge (1x L40S 48GB GPU) to accommodate Llama-3.1-8B model
+
+    Args:
+        ec2_connection: Fabric connection to EC2 instance
+        sglang: SGLang Docker image URI
+    """
+    try:
+        LOGGER.info("\n" + "=" * 80)
+        LOGGER.info("Starting SGLang EC2 Upstream Test")
+        LOGGER.info("=" * 80 + "\n")
+
+        # Setup
+        setup_docker_image(ec2_connection, sglang)
+
+        # Get HuggingFace token from AWS Secrets Manager
+        LOGGER.info("Retrieving HF_TOKEN from AWS Secrets Manager...")
+        hf_token = get_hf_token_from_secrets_manager()
+
+        if not hf_token:
+            # Fallback to environment variable
+            hf_token = os.environ.get("HF_TOKEN", "")
+            if hf_token:
+                LOGGER.info("Using HF_TOKEN from environment variable")
+            else:
+                pytest.skip(
+                    "HF_TOKEN not found in Secrets Manager or environment. Skipping test requiring gated models."
+                )
+
+        # Get SGLang version dynamically from image URI
+        _, sglang_version = get_framework_and_version_from_tag(sglang)
+        LOGGER.info(f"Detected SGLang version: {sglang_version}")
+
+        # Clone SGLang source
+        LOGGER.info("Cloning SGLang source repository...")
+        ec2_connection.run("rm -rf /tmp/sglang_source", warn=True)
+        ec2_connection.run(
+            f"git clone --branch v{sglang_version} --depth 1 "
+            f"https://github.com/sgl-project/sglang.git /tmp/sglang_source"
+        )
+
+        # Start container with bash entrypoint
+        container_name = "sglang_upstream"
+        container_cmd = f"""
+            docker run -d --name {container_name} --rm --gpus=all \
+            --user root \
+            --entrypoint /bin/bash \
+            -v /home/ec2-user/.cache/huggingface:/root/.cache/huggingface \
+            -v /tmp/sglang_source:/workdir \
+            --workdir /workdir \
+            -e HF_TOKEN={hf_token} \
+            -e HUGGINGFACE_HUB_TOKEN={hf_token} \
+            {sglang} \
+            -c "tail -f /dev/null"
+        """
+
+        LOGGER.info("Starting SGLang container with bash entrypoint...")
+        ec2_connection.run(container_cmd)
+
+        # Verify HF token is available
+        LOGGER.info("Verifying HuggingFace token...")
+        ec2_connection.run(
+            f"""docker exec -u root {container_name} bash -c '
+                env | grep -E "HF_TOKEN|HUGGINGFACE_HUB_TOKEN" || echo "No HF tokens found"
+            '""",
+            warn=True,
+        )
+
+        # Install test dependencies
+        LOGGER.info("Installing SGLang test dependencies...")
+        ec2_connection.run(
+            f"docker exec -u root {container_name} bash scripts/ci/ci_install_dependency.sh"
+        )
+
+        # Authenticate with HuggingFace for gated models
+        LOGGER.info("Authenticating with HuggingFace for gated model access...")
+        ec2_connection.run(
+            f"docker exec -u root {container_name} huggingface-cli login --token {hf_token}"
+        )
+        LOGGER.info("✓ Successfully authenticated with HuggingFace")
+
+        # Check GPU availability
+        LOGGER.info("Checking GPU availability:")
+        ec2_connection.run(f"docker exec -u root {container_name} nvidia-smi")
+
+        # Run upstream test suite
+        LOGGER.info("Running SGLang upstream test suite (stage-a-test-1)...")
+        test_cmd = f"""
+            docker exec -u root {container_name} sh -c '
+                set -eux
+                cd /workdir/test
+                python3 run_suite.py --hw cuda --suite stage-a-test-1
+            '
+        """
+
+        # warn=True so a nonzero exit reaches the return-code checks below
+        result = ec2_connection.run(test_cmd, warn=True)
+
+        # Capture logs if test fails
+        if result.return_code != 0:
+            LOGGER.error("Capturing container logs for debugging...")
+            ec2_connection.run(f"docker logs {container_name} --tail 200", warn=True)
+
+        if result.return_code == 0:
+            LOGGER.info("\n✓ SGLang upstream test passed successfully")
+        else:
+            LOGGER.error(f"\n✗ Upstream test failed with return code {result.return_code}")
+            raise AssertionError(f"Upstream test failed with return code {result.return_code}")
+
+    finally:
+        cleanup_containers(ec2_connection)
+        # Run as sudo since files may have been created by root in container
+        ec2_connection.run("sudo rm -rf /tmp/sglang_source", warn=True)
diff --git a/test/testrunner.py b/test/testrunner.py
index eb75c5bff1cb..dfcfae81b5c2 100644
--- a/test/testrunner.py
+++ b/test/testrunner.py
@@ -297,11 +297,23 @@ def main():
            "functionality_sanity",
            "security_sanity",
            "sagemaker",
+           "ec2",
        }:
            LOGGER.info(
                f"NOTE: {specific_test_type} tests not supported on sglang images. Skipping..."
            )
            return
+
+    # Skip telemetry tests for sglang in all contexts (PR and MAINLINE)
+    is_sglang_image = all("sglang" in image_uri for image_uri in all_image_list)
+    if is_sglang_image and specific_test_type == "telemetry":
+        LOGGER.info(
+            f"NOTE: {specific_test_type} tests not supported on SGLang Containers. Skipping..."
+        )
+        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
+        sm_utils.generate_empty_report(report, test_type, "sglang")
+        return
+
    # quick_checks tests don't have images in it. Using a placeholder here for jobs like that
    try:
        framework, version = get_framework_and_version_from_tag(all_image_list[0])
@@ -386,7 +398,7 @@
    if specific_test_type in ["eks", "ec2"] and not is_all_images_list_eia:
        frameworks_in_images = [
            framework
-           for framework in ("mxnet", "pytorch", "tensorflow", "vllm")
+           for framework in ("mxnet", "pytorch", "tensorflow", "vllm", "sglang")
            if framework in dlc_images
        ]
        if len(frameworks_in_images) != 1:
@@ -400,6 +412,8 @@
            run_vllm_tests(f"{specific_test_type}", all_image_list, new_test_structure_enabled)
            return
+    # Set up EKS cluster for EKS tests.
+
    if specific_test_type == "eks":
        eks_cluster_name = f"dlc-{framework}-{build_context}"
        eks_utils.eks_setup()
        if eks_utils.is_eks_cluster_active(eks_cluster_name):
@@ -420,7 +434,12 @@
            f"--junitxml={report}",
            "-n=auto",
        ]
-       if specified_tests:
+
+       # Skip telemetry tests for sglang images
+       if is_sglang_image and specific_test_type == "ec2":
+           pytest_cmd.extend(["-k", "not telemetry"])
+           LOGGER.info("Excluding telemetry tests from sglang ec2 suite")
+       elif specified_tests:
            test_expr = " or ".join(f"test_{t}" for t in specified_tests)
            pytest_cmd.extend(["-k", f"({test_expr})"])