NVlabs · Aero-Ex · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
diff --git a/.github/workflows/build_binary_wheel.yml b/.github/workflows/build_binary_wheel.yml
@@ -0,0 +1,172 @@
+name: Build nvdiffrast Binary Wheel (Windows CUDA 12.4)
+
+on:
+  push:
+    branches: [main, master]  # Only pushes to these two branches start a build
+  pull_request:  # No branch filter is given for pull requests
+  workflow_dispatch:  # Allow manual trigger
+
+jobs:
+  build_windows_binary:
+    runs-on: windows-2022  # Has MSVC pre-installed
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # Quoted so it stays a string; matches the py311 tag in the artifact name below
+
+      - name: Install CUDA Toolkit 12.4
+        uses: Jimver/cuda-toolkit@v0.2.19  # NOTE(review): tag was garbled in extraction ("[email protected]") - confirm the intended release
+        with:
+          cuda: '12.4.0'
+          method: 'network'
+          use-github-cache: false
+          use-local-cache: false
+
+      - name: Verify CUDA and Python
+        run: |
+          $ErrorActionPreference = 'Continue'  # Diagnostics only: keep printing even if one tool is missing
+
+          Write-Host "=== Python Version ==="
+          python --version
+
+          Write-Host "`n=== NVCC Version ==="
+          nvcc --version
+
+          Write-Host "`n=== CUDA Environment ==="
+          Write-Host "CUDA_PATH: $env:CUDA_PATH"
+
+          Write-Host "`n=== MSVC Compiler (optional check) ==="
+          where.exe cl.exe 2>&1 | Out-Null  # Only the exit code is used; the output is discarded
+          if ($LASTEXITCODE -eq 0) {
+              Write-Host "Found in PATH"
+          } else {
+              Write-Host "Not in PATH (will be auto-detected during build)"
+          }
+
+          Write-Host "`n[OK] Verification complete"; exit 0  # Do not let where.exe's non-zero code mark this informational step as failed
+        shell: powershell
+        continue-on-error: true
+
+      - name: Install build dependencies  # Uses "python -m pip": pip.exe cannot reliably replace itself on Windows
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install wheel setuptools ninja
+        shell: powershell
+
+      - name: Install PyTorch 2.6.0 (CUDA 12.4)  # Only torch is pinned; torchvision/torchaudio versions are left to pip's resolver
+        run: |
+          pip install torch==2.6.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124
+        shell: powershell
+
+      - name: Install NumPy 1.26.4  # Runs after the PyTorch step, so this pin replaces any NumPy installed there
+        run: |
+          pip install numpy==1.26.4
+        shell: powershell
+
+      - name: Verify PyTorch CUDA  # Prints the torch version, CUDA availability and CUDA version; asserts nothing
+        run: |
+          python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}')"
+        shell: powershell
+
+      - name: Build binary wheel
+        run: |
+          $ErrorActionPreference = 'Continue'  # presumably so stderr captured via 2>&1 below does not abort the script - confirm
+
+          Write-Host "=== Setting up MSVC environment ==="
+          # Find and activate Visual Studio Developer environment
+          $vsPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
+          if (-not $vsPath) { Write-Host "[FAIL] vswhere found no Visual Studio installation"; exit 1 }
+          Write-Host "Visual Studio path: $vsPath"
+
+          Import-Module "$vsPath\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
+          Enter-VsDevShell -VsInstallPath $vsPath -SkipAutomaticLocation -DevCmdArguments "-arch=x64 -host_arch=x64"
+
+          Write-Host "Verifying cl.exe is now in PATH:"
+          where.exe cl.exe
+          if ($LASTEXITCODE -ne 0) { Write-Host "[FAIL] cl.exe not found after Enter-VsDevShell"; exit 1 }
+
+          Write-Host "`n=== Building binary wheel with CUDA extensions ==="
+          $env:DISTUTILS_USE_SDK = "1"  # BUILD_BINARY_WHEEL and TORCH_CUDA_ARCH_LIST are set once, in this step's env: block
+          $env:MAX_JOBS = "2"
+
+          Write-Host "Build environment:"
+          Write-Host "  CUDA_PATH: $env:CUDA_PATH"
+          Write-Host "  BUILD_BINARY_WHEEL: $env:BUILD_BINARY_WHEEL"
+          Write-Host "  TORCH_CUDA_ARCH_LIST: $env:TORCH_CUDA_ARCH_LIST"
+          Write-Host "  MAX_JOBS: $env:MAX_JOBS"
+          Write-Host ""
+
+          python setup.py bdist_wheel > build.log 2>&1
+          $buildExitCode = $LASTEXITCODE  # Saved immediately, before any later command can overwrite it
+
+          Write-Host "`n=== Build Output ==="
+          Get-Content build.log
+
+          if ($buildExitCode -ne 0) {
+              Write-Host "`n=== BUILD FAILED (exit code: $buildExitCode) ==="
+              exit $buildExitCode
+          }
+
+          Write-Host "`n=== Built wheel ==="
+          $wheels = Get-ChildItem dist\*.whl
+          foreach ($wheel in $wheels) {
+              Write-Host "  Wheel: $($wheel.Name)"
+              Write-Host "  Size: $([math]::Round($wheel.Length / 1MB, 2)) MB"
+          }
+        shell: powershell
+        env:
+          BUILD_BINARY_WHEEL: "1"
+          TORCH_CUDA_ARCH_LIST: "6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"
+
+      - name: Upload wheel artifact  # Runs before the install test below, so the artifact is uploaded even if that test fails
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvdiffrast-binary-wheel-py311-cu124-win_amd64
+          path: dist/*.whl
+          retention-days: 30
+
+      - name: Test wheel installation
+        run: |
+          # Windows PowerShell keeps going after a failing native command, so every exit code is checked explicitly.
+          function Assert-Ok($what) { if ($LASTEXITCODE -ne 0) { Write-Host "[FAIL] $what (exit code $LASTEXITCODE)"; exit $LASTEXITCODE } }
+          Write-Host "=== Creating test environment ==="
+          python -m venv test_env; Assert-Ok "creating the virtual environment"
+          .\test_env\Scripts\Activate.ps1
+
+          Write-Host "`n=== Installing PyTorch in test env ==="
+          pip install torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124; Assert-Ok "installing PyTorch"
+          Write-Host "`n=== Installing built wheel ==="
+          $wheel = Get-ChildItem dist\*.whl | Select-Object -First 1
+          if (-not $wheel) { Write-Host "[FAIL] no wheel found in dist\"; exit 1 }
+          pip install $wheel.FullName; Assert-Ok "installing the built wheel"
+          Write-Host "`n=== Testing import ==="
+          python -c "import nvdiffrast.torch as dr; print('[OK] nvdiffrast imported successfully')"; Assert-Ok "importing nvdiffrast.torch"
+          Write-Host "`n=== Testing CUDA plugin load ==="
+          python -c "from nvdiffrast.torch import ops; plugin = ops._get_plugin(); print('[OK] CUDA plugin loaded successfully')"; Assert-Ok "loading the CUDA plugin"
+
+          Write-Host "`n=== Verifying pre-compiled plugin ==="
+          python -c "import sys; import torch; import nvdiffrast_plugin; print('[OK] Pre-compiled plugin found:', nvdiffrast_plugin.__file__)"; Assert-Ok "importing nvdiffrast_plugin"
+        shell: powershell
+
+      - name: Display build summary  # Informational only: lists the wheels in dist\ and prints download instructions
+        run: |
+          Write-Host "`n=========================================="
+          Write-Host "✅ BUILD SUCCESSFUL!"
+          Write-Host "=========================================="
+          Write-Host "`nBuilt wheel:"
+          Get-ChildItem dist\*.whl | ForEach-Object {
+              Write-Host "  📦 $($_.Name)"
+              Write-Host "     Size: $([math]::Round($_.Length / 1MB, 2)) MB"
+          }
+          Write-Host "`n📥 To download:"
+          Write-Host "  1. Go to Actions tab in GitHub"
+          Write-Host "  2. Click on this workflow run"
+          Write-Host "  3. Download artifact from 'Artifacts' section"
+          Write-Host "`n✨ This is a TRUE BINARY WHEEL (cp311-cp311-win_amd64)"
+          Write-Host "   No MSVC Build Tools required for installation!"
+        shell: powershell
diff --git a/BUILD_README.md b/BUILD_README.md
@@ -0,0 +1,128 @@
+# nvdiffrast Binary Wheel Build
+
+This fork adds support for building **true binary wheels** for Windows with pre-compiled CUDA extensions.
+
+## What's Different?
+
+| Aspect | Original nvdiffrast | This Fork |
+|--------|-------------------|-----------|
+| Wheel Type | Source wheel (`py3-none-any`) | Binary wheel (`cp311-cp311-win_amd64`) |
+| Installation Requires MSVC? | ✅ YES (~7GB) | ❌ NO |
+| Compilation | JIT at runtime (1-2 min first use) | Pre-compiled (instant load) |
+| Setup | Complex | Simple `pip install` |
+
+## Changes Made
+
+### 1. Modified `setup.py`
+- Added environment variable `BUILD_BINARY_WHEEL` to trigger binary build mode
+- Uses `torch.utils.cpp_extension.CUDAExtension` to compile CUDA code during wheel build
+- Falls back to original source wheel behavior if `BUILD_BINARY_WHEEL != 1`
+
+### 2. Modified `nvdiffrast/torch/ops.py`
+- Added pre-compiled extension loader at the beginning of `_get_plugin()`
+- Tries to import `nvdiffrast_plugin` (the pre-compiled extension) first
+- Falls back to JIT compilation if pre-compiled version not found
+- Backwards compatible with source wheels
+
+### 3. Added GitHub Actions Workflow
+- `.github/workflows/build_binary_wheel.yml`
+- Automatically builds binary wheels on push
+- Installs CUDA 12.4 toolkit in CI
+- Tests the built wheel
+- Uploads as artifact
+
+## Building Binary Wheels
+
+### Using GitHub Actions (Recommended)
+
+1. Push changes to your fork
+2. GitHub Actions automatically builds the wheel
+3. Download from Actions > Artifacts
+
+### Manual Build (Requires MSVC + CUDA 12.4)
+
+```powershell
+# Install dependencies
+pip install wheel setuptools ninja
+pip install torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+pip install numpy==1.26.4
+
+# Build binary wheel
+$env:BUILD_BINARY_WHEEL = "1"
+$env:TORCH_CUDA_ARCH_LIST = "6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0"
+python setup.py bdist_wheel
+
+# Wheel will be in dist/ folder
+```
+
+## Installing Binary Wheel
+
+```bash
+# Install PyTorch first (required dependency)
+pip install torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124
+
+# Install the binary wheel
+pip install nvdiffrast-0.3.4-cp311-cp311-win_amd64.whl
+
+# Test it
+python -c "import nvdiffrast.torch as dr; print('Success!')"
+```
+
+## Requirements
+
+### Build Requirements (CI only)
+- Python 3.11
+- PyTorch 2.6.0 (CUDA 12.4)
+- CUDA Toolkit 12.4
+- MSVC compiler
+- NumPy 1.26.4
+
+### Runtime Requirements (End Users)
+- Python 3.11
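+- PyTorch 2.6.0 (CUDA 12.4) — the pre-compiled extension is built against this exact PyTorch build
+- NVIDIA GPU with CUDA support
+- NVIDIA driver that supports CUDA 12.4 (the CUDA runtime libraries ship with the PyTorch wheel, not with the driver)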
+- **NO MSVC Build Tools required** ✅
+
+## Supported GPU Architectures
+
+The wheel includes pre-compiled code for these compute capabilities:
+- 6.0, 6.1 (Tesla P100, GTX 10-series)
+- 7.0, 7.5 (RTX 20-series, Tesla V100/T4)
+- 8.0 (A100)
+- 8.6 (RTX 30-series)
+- 8.9 (RTX 40-series)
+- 9.0 (H100 / Hopper)
+
+## Backwards Compatibility
+
+The modified `setup.py` is **fully backwards compatible**:
+- Without `BUILD_BINARY_WHEEL=1`: Builds original source wheel
+- With `BUILD_BINARY_WHEEL=1`: Builds binary wheel with pre-compiled extensions
+
+The modified `ops.py` is also backwards compatible:
+- If pre-compiled extension exists: Uses it
+- If not: Falls back to JIT compilation (original behavior)
+
+## Troubleshooting
+
+### "Could not locate MSVC installation" during build
+- Use GitHub Actions (has MSVC pre-installed)
+- Or install Visual Studio Build Tools locally
+
+### "CUDA error: no kernel image is available"
+- Your GPU architecture wasn't included in `TORCH_CUDA_ARCH_LIST`
+- Rebuild with your GPU's compute capability
+
+### Import error at runtime
+- Ensure PyTorch 2.6.0 (CUDA 12.4) is installed; the wheel is built against that exact version
+- Verify the NVIDIA driver is recent enough for CUDA 12.4 (check with `nvidia-smi`)
+
+## Credits
+
+- Original nvdiffrast: https://github.com/NVlabs/nvdiffrast
+- Binary wheel modifications: Aero-Ex
+
+## License
+
+Same as original nvdiffrast (NVIDIA Source Code License)
diff --git a/...ast/common/cudaraster/impl/RasterImpl.cpp → ...ommon/cudaraster/impl/RasterImpl_host.cpp b/...ast/common/cudaraster/impl/RasterImpl.cpp → ...ommon/cudaraster/impl/RasterImpl_host.cpp
@@ -324,12 +324,13 @@ void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
         p.activeTiles       = m_activeTiles.getPtr();
         p.tileFirstSeg      = m_tileFirstSeg.getPtr();
 
+        p.strideX           = m_bufferSizePixels.x;
+        p.strideY           = m_bufferSizePixels.y;
+
         size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
         p.colorBuffer       = m_colorBuffer.getPtr(byteOffset);
         p.depthBuffer       = m_depthBuffer.getPtr(byteOffset);
         p.peelBuffer        = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
-        p.strideX           = m_bufferSizePixels.x;
-        p.strideY           = m_bufferSizePixels.y;
 
         memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
         p.imageParamsExtra  = (CRImageParams*)m_crImageParamsExtra.getPtr();

diff --git a/nvdiffrast/common/texture.cpp → nvdiffrast/common/texture_host.cpp b/nvdiffrast/common/texture.cpp → nvdiffrast/common/texture_host.cpp
diff --git a/nvdiffrast/torch/ops.py b/nvdiffrast/torch/ops.py
@@ -23,6 +23,21 @@ def _get_plugin(gl=False):
     if _cached_plugin.get(gl, None) is not None:
         return _cached_plugin[gl]
 
+    # Try to load pre-compiled extension first (for binary wheels)
+    try:
+        if gl:
+            import nvdiffrast_plugin_gl
+            _cached_plugin[gl] = nvdiffrast_plugin_gl
+        else:
+            import nvdiffrast_plugin
+            _cached_plugin[gl] = nvdiffrast_plugin
+
+        logging.getLogger('nvdiffrast').info("[OK] Loaded pre-compiled nvdiffrast plugin")
+        return _cached_plugin[gl]
+    except ImportError:
+        # Pre-compiled extension not found, fall back to JIT compilation
+        logging.getLogger('nvdiffrast').info("Pre-compiled plugin not found, using JIT compilation")
+
     # Make sure we can find the necessary compiler and libary binaries.
     if os.name == 'nt':
         lib_dir = os.path.dirname(__file__) + r"\..\lib"