diff --git a/singularity/Singularity.rag b/singularity/Singularity.rag
index 40ad3d4..7457869 100644
--- a/singularity/Singularity.rag
+++ b/singularity/Singularity.rag
@@ -15,11 +15,13 @@
 mkdir -p /app
 cd /app
 apt-get update && apt-get install -y --no-install-recommends git wget curl build-essential ca-certificates && rm -rf /var/lib/apt/lists/*
 pip3 install --no-cache-dir chromadb "fastapi>=0.110,<0.112" "uvicorn[standard]>=0.29,<0.31" "transformers>=4.41,<4.45" "sentence-transformers>=2.6,<3" "chromadb==0.5.4" "watchdog>=3,<5" "pypdf>=4,<5" requests httpx pyyaml
+# Exported so child processes (pip, any HF downloads run later in %post) see it;
+# a bare assignment would only be visible to the shell itself.
+export HF_HOME=/root/.cache/huggingface
 chmod +x singularity-entrypoint.sh
 
 %environment
 export HF_HOME=/root/.cache/huggingface
+export LD_LIBRARY_PATH=/opt/conda/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
 
 %runscript
 /bin/bash singularity-entrypoint.sh
\ No newline at end of file
diff --git a/singularity/Singularity.vllm b/singularity/Singularity.vllm
index d24f421..844c477 100644
--- a/singularity/Singularity.vllm
+++ b/singularity/Singularity.vllm
@@ -12,6 +12,12 @@ From: vllm/vllm-openai:latest
     pkg-config cmake ninja-build
   rm -rf /var/lib/apt/lists/*
 
+  # FIPS workaround: opencv-python-headless >=4.13 bundles a FIPS-enabled
+  # OpenSSL 1.1.1k from CentOS/RHEL that crashes on FIPS-enabled hosts.
+  # Pin to 4.12.0.88 which does not bundle OpenSSL.
+  # See: https://github.com/opencv/opencv-python/issues/1184
+  pip install opencv-python-headless==4.12.0.88
+
   # Pick a Python interpreter that actually exists in the base image
   if command -v python >/dev/null 2>&1; then
     PY=python
@@ -37,6 +43,9 @@
 print("Transformers:", transformers.__version__)
 print("vLLM:", vllm.__version__)
 PY
+%environment
+export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+
 %runscript
 mkdir -p /app
 cd /app
diff --git a/workflow.yaml b/workflow.yaml
index f1b6a2f..688a738 100644
--- a/workflow.yaml
+++ b/workflow.yaml
@@ -468,10 +468,48 @@ jobs:
 REPO_URL="https://huggingface.co/$MODEL_ID"
 [[ -n "$HF_TOKEN" ]] && REPO_URL="https://user:${HF_TOKEN}@huggingface.co/$MODEL_ID"
 
-# Clone with LFS and pull large files
-GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 "$REPO_URL" "$TARGET_DIR"
+# Clone without LFS or checkout, then sparse-checkout only safetensors
+GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --no-checkout "$REPO_URL" "$TARGET_DIR"
 cd "$TARGET_DIR"
-git lfs pull
+
+# Sparse checkout: exclude bin/gguf/onnx/pth to only get safetensors weights
+git sparse-checkout init --no-cone
+git sparse-checkout set '/*' '!*.bin' '!*.gguf' '!*.onnx' '!consolidated*.pth'
+# Skip the LFS smudge filter here too: if git-lfs is installed globally this
+# checkout would otherwise serially download every weight, bypassing the
+# selective fetch + retry logic below.
+GIT_LFS_SKIP_SMUDGE=1 git checkout
+
+# Configure LFS for reliability with large models
+git lfs install --local
+git config lfs.concurrenttransfers 4
+git config lfs.transfer.maxretries 10
+git config lfs.transfer.maxretrydelay 30
+
+# Fetch only safetensors LFS objects with progress and retry logic
+attempt=1
+max_attempts=3
+while true; do
+  echo "LFS fetch attempt ${attempt}/${max_attempts}..."
+  if git lfs fetch --progress --include="*.safetensors"; then
+    break
+  fi
+  if [[ $attempt -ge $max_attempts ]]; then
+    echo "ERROR: LFS fetch failed after ${max_attempts} attempts"
+    exit 1
+  fi
+  sleep $((attempt * 5))
+  ((attempt++))
+done
+echo "Checking out LFS files..."
+git lfs checkout --include="*.safetensors"
+
+# Fallback: if model only ships bin weights, re-include and fetch those
+if ! ls *.safetensors 1>/dev/null 2>&1; then
+  echo "No safetensors files found, falling back to bin weights..."
+  git sparse-checkout set '/*'
+  GIT_LFS_SKIP_SMUDGE=1 git checkout
+  git lfs fetch --progress --include="*.bin"
+  git lfs checkout --include="*.bin"
+fi
+
 cd ..
 
 # Verify model weights exist (not just LFS pointers)
diff --git a/yamls/hsp.yaml b/yamls/hsp.yaml
index e5180ef..8285aa3 100644
--- a/yamls/hsp.yaml
+++ b/yamls/hsp.yaml
@@ -468,10 +468,48 @@ jobs:
 REPO_URL="https://huggingface.co/$MODEL_ID"
 [[ -n "$HF_TOKEN" ]] && REPO_URL="https://user:${HF_TOKEN}@huggingface.co/$MODEL_ID"
 
-# Clone with LFS and pull large files
-GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 "$REPO_URL" "$TARGET_DIR"
+# Clone without LFS or checkout, then sparse-checkout only safetensors
+GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --no-checkout "$REPO_URL" "$TARGET_DIR"
 cd "$TARGET_DIR"
-git lfs pull
+
+# Sparse checkout: exclude bin/gguf/onnx/pth to only get safetensors weights
+git sparse-checkout init --no-cone
+git sparse-checkout set '/*' '!*.bin' '!*.gguf' '!*.onnx' '!consolidated*.pth'
+# Skip the LFS smudge filter here too: if git-lfs is installed globally this
+# checkout would otherwise serially download every weight, bypassing the
+# selective fetch + retry logic below.
+GIT_LFS_SKIP_SMUDGE=1 git checkout
+
+# Configure LFS for reliability with large models
+git lfs install --local
+git config lfs.concurrenttransfers 4
+git config lfs.transfer.maxretries 10
+git config lfs.transfer.maxretrydelay 30
+
+# Fetch only safetensors LFS objects with progress and retry logic
+attempt=1
+max_attempts=3
+while true; do
+  echo "LFS fetch attempt ${attempt}/${max_attempts}..."
+  if git lfs fetch --progress --include="*.safetensors"; then
+    break
+  fi
+  if [[ $attempt -ge $max_attempts ]]; then
+    echo "ERROR: LFS fetch failed after ${max_attempts} attempts"
+    exit 1
+  fi
+  sleep $((attempt * 5))
+  ((attempt++))
+done
+echo "Checking out LFS files..."
+git lfs checkout --include="*.safetensors"
+
+# Fallback: if model only ships bin weights, re-include and fetch those
+if ! ls *.safetensors 1>/dev/null 2>&1; then
+  echo "No safetensors files found, falling back to bin weights..."
+  git sparse-checkout set '/*'
+  GIT_LFS_SKIP_SMUDGE=1 git checkout
+  git lfs fetch --progress --include="*.bin"
+  git lfs checkout --include="*.bin"
+fi
+
 cd ..
 
 # Verify model weights exist (not just LFS pointers)