feat(paged-attention): integrate vllm metal paged attention v1 #732
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow for pull requests that target `main`.
#
# A two-entry matrix (ubuntu-latest and macos-latest, both on Python 3.11)
# installs the project with platform-specific extras. Unit tests, the coverage
# upload and the end-to-end server test run on macOS only. Everything after
# the path filter is skipped unless `src/**`, `tests/**` or `pyproject.toml`
# changed in the pull request.
name: CI

on:
  pull_request:
    branches: [main]

# Least-privilege token: checkout reads the repository contents, and
# dorny/paths-filter reads the pull request's changed-file list (see its
# README for the required `pull-requests: read` permission).
permissions:
  contents: read
  pull-requests: read

jobs:
  test:
    name: Test on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the other platform finish even if one matrix entry fails.
      fail-fast: false
      matrix:
        include:
          - os: ubuntu-latest
            python-version: '3.11'
            # Install gpu (sglang) and dev dependencies for Linux
            extras: 'gpu, dev'
          - os: macos-latest
            python-version: '3.11'
            # Install mac (mlx) and dev dependencies for macOS
            extras: 'mac, dev'
    steps:
      - name: Free Disk Space (Ubuntu)
        if: runner.os == 'Linux'
        # NOTE(review): the action reference was garbled in the captured
        # source; `free-disk-space@v1.3.1` is a reconstruction. Confirm the
        # tag against the repository's real workflow file.
        uses: jlumbroso/free-disk-space@v1.3.1
        with:
          tool-cache: false
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: false

      - name: Checkout code
        uses: actions/checkout@v4

      # Exposes `steps.changes.outputs.src`, which gates every later step.
      - name: Check for file changes
        id: changes
        uses: dorny/paths-filter@v3
        with:
          filters: |
            src:
              - 'src/**'
              - 'tests/**'
              - 'pyproject.toml'

      - name: Set up Python ${{ matrix.python-version }}
        if: steps.changes.outputs.src == 'true'
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Cache pip dependencies
        if: steps.changes.outputs.src == 'true'
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          # Exact hit requires the same OS, Python version and pyproject.toml;
          # the restore keys fall back to progressively less specific caches.
          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-${{ matrix.python-version }}-
            ${{ runner.os }}-pip-

      - name: Install dependencies
        if: steps.changes.outputs.src == 'true'
        run: |
          python -m pip install --upgrade pip
          # Install extras dependencies based on matrix variable
          pip install -e ".[${{ matrix.extras }}]"

      - name: Run Unit Tests (macOS only)
        if: steps.changes.outputs.src == 'true' && runner.os == 'macOS'
        shell: bash
        run: |
          pytest tests/ -v --cov=src/parallax --cov-report=xml

      - name: Upload coverage to Codecov
        if: steps.changes.outputs.src == 'true' && runner.os == 'macOS'
        uses: codecov/codecov-action@v4
        with:
          file: ./coverage.xml
          # A failed upload must not fail the job.
          fail_ci_if_error: false
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Run E2E tests (macOS only)
        if: steps.changes.outputs.src == 'true' && runner.os == 'macOS'
        shell: bash
        run: |
          # Start the server in the background
          python src/parallax/launch.py \
            --model-path Qwen/Qwen3-0.6B \
            --max-num-tokens-per-batch 16384 \
            --kv-block-size 1024 \
            --max-batch-size 128 \
            --start-layer 0 \
            --end-layer 28 &
          PID=$!

          # Stop the server on every exit path. `shell: bash` runs with
          # errexit, so a failing command (e.g. `curl --fail` below) would
          # otherwise abort the script before any explicit cleanup ran.
          trap 'kill "$PID" 2>/dev/null || true' EXIT

          echo "Waiting for server to start..."
          # Poll the endpoint: 30 attempts, 2 seconds apart (about 60 seconds)
          for i in {1..30}; do
            # 200, 400 and 405 all show that the HTTP server is answering
            if curl -s -o /dev/null -w "%{http_code}" \
              http://localhost:3000/v1/chat/completions \
              | grep -qE "200|400|405"; then
              echo "Server is up!"
              break
            fi

            # Give up early if the server process has already exited
            if ! kill -0 "$PID" 2>/dev/null; then
              echo "Server process died prematurely"
              exit 1
            fi

            if [ "$i" -eq 30 ]; then
              echo "Server failed to start within 60 seconds"
              exit 1
            fi

            sleep 2
          done

          echo "Sending test request..."
          # Capture the response; --fail turns an HTTP error into a non-zero exit
          RESPONSE=$(curl --fail --silent --show-error --location 'http://localhost:3000/v1/chat/completions' \
            --header 'Content-Type: application/json' \
            --data '{
              "messages": [
                {
                  "role": "user",
                  "content": "What is the capital of France"
                }
              ],
              "stream": false,
              "max_tokens": 1024,
              "chat_template_kwargs": {"enable_thinking": false},
              "sampling_params": {
                "top_k": 3
              }
            }')

          echo "Response received:"
          echo "$RESPONSE"

          # Check if the response contains "Paris" (case-insensitive)
          if echo "$RESPONSE" | grep -iq "Paris"; then
            echo "Test passed: Response contains 'Paris'"
          else
            echo "Test failed: Response does not contain 'Paris'"
            exit 1
          fi
          # The EXIT trap stops the server process here as well.