fix: revert unintended image downgrades
Signed-off-by: Tomas Coufal <[email protected]>
tumido committed Jan 22, 2025
1 parent c923f17 commit 85b212f
Showing 4 changed files with 48 additions and 66 deletions.
2 changes: 1 addition & 1 deletion importer-pipeline.yaml
@@ -38,7 +38,7 @@ deploymentSpec:
value: /tmp
- name: XDG_DATA_HOME
value: /tmp
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae
pipelineInfo:
description: Helper pipeline to the InstructLab pipeline which allows users to seed/import
a new base model
8 changes: 4 additions & 4 deletions pipeline.yaml
@@ -648,7 +648,7 @@ deploymentSpec:
env:
- name: XDG_CACHE_HOME
value: /tmp
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae
exec-deletepvc:
container:
image: argostub/deletepvc
@@ -1371,7 +1371,7 @@ deploymentSpec:
value: /tmp
- name: JUDGE_CA_CERT_PATH
value: /tmp/cert/ca.crt
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae
resources:
accelerator:
count: '1'
@@ -1501,7 +1501,7 @@ deploymentSpec:
value: /tmp
- name: JUDGE_CA_CERT_PATH
value: /tmp/cert/ca.crt
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae
resources:
accelerator:
count: '1'
@@ -1614,7 +1614,7 @@ deploymentSpec:
value: /tmp
- name: SDG_CA_CERT_PATH
value: /tmp/cert/ca.crt
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e
image: registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae
exec-sdg-to-artifact-op:
container:
args:
96 changes: 39 additions & 57 deletions standalone/standalone.py
@@ -1132,20 +1132,27 @@ def sdg_op(
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = 1.0,
):
from os import getenv, path
import os
import shutil
import tempfile
import instructlab.sdg
import openai
import xdg_base_dirs
import yaml
api_key = getenv("api_key")
model = getenv("model")
endpoint = getenv("endpoint")
api_key = os.getenv("api_key")
model = os.getenv("model")
endpoint = os.getenv("endpoint")
if sdg_ca_cert := getenv("SDG_CA_CERT_PATH"):
sdg_ca_cert_path = os.getenv("SDG_CA_CERT_PATH")
use_tls = os.path.exists(sdg_ca_cert_path) and (
os.path.getsize(sdg_ca_cert_path) > 0
)
if use_tls:
import httpx
custom_http_client = httpx.Client(verify=sdg_ca_cert)
custom_http_client = httpx.Client(verify=sdg_ca_cert_path)
client = openai.OpenAI(
base_url=endpoint, api_key=api_key, http_client=custom_http_client
)
@@ -1184,7 +1191,7 @@ def sdg_op(
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
if os.path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)
@@ -1200,16 +1207,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile
import xdg_base_dirs
# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
temp_dir = os.path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)
# Copy default_data_recipes/skills.yaml to the temporary directory
@@ -1222,7 +1224,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
temp_pipeline_dir = os.path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
@@ -1235,7 +1237,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
break
# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
new_skills_recipe = os.path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")
# Override XDG_DATA_DIRS with the temporary directory
@@ -1661,28 +1663,18 @@ def run_mt_bench_op(
import os
import subprocess
import httpx
import torch
from instructlab.eval.mt_bench import MTBenchEvaluator
if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"):
import httpx
import openai
# Create a custom HTTP client
class CustomHttpClient(httpx.Client):
def __init__(self, *args, **kwargs):
# Use the custom CA certificate
kwargs.setdefault("verify", judge_ca_cert)
super().__init__(*args, **kwargs)
# Create a new OpenAI class that uses the custom HTTP client
class CustomOpenAI(openai.OpenAI):
def __init__(self, *args, **kwargs):
custom_client = CustomHttpClient()
super().__init__(http_client=custom_client, *args, **kwargs)
# Monkey patch the OpenAI class in the openai module, so that the eval lib can use it
openai.OpenAI = CustomOpenAI
judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
judge_endpoint = os.getenv("JUDGE_ENDPOINT")
judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH")
use_tls = os.path.exists(judge_ca_cert_path) and (
os.path.getsize(judge_ca_cert_path) > 0
)
judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None
def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
@@ -1775,10 +1767,6 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
models_list = os.listdir(models_folder)
judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
judge_endpoint = os.getenv("JUDGE_ENDPOINT")
scores = {}
all_mt_bench_data = []
@@ -1814,6 +1802,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
server_url=vllm_server,
serving_gpus=gpu_count,
max_workers=max_workers,
http_client=judge_http_client,
)
shutdown_vllm(vllm_process)
@@ -1823,6 +1812,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
api_key=judge_api_key,
serving_gpus=gpu_count,
max_workers=max_workers,
http_client=judge_http_client,
)
mt_bench_data = {
@@ -1884,30 +1874,20 @@ def run_final_eval_op(
import os
import subprocess
import httpx
import torch
from instructlab.eval.mmlu import MMLUBranchEvaluator
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score
if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"):
import httpx
import openai
# Create a custom HTTP client
class CustomHttpClient(httpx.Client):
def __init__(self, *args, **kwargs):
# Use the custom CA certificate
kwargs.setdefault("verify", judge_ca_cert)
super().__init__(*args, **kwargs)
# Create a new OpenAI class that uses the custom HTTP client
class CustomOpenAI(openai.OpenAI):
def __init__(self, *args, **kwargs):
custom_client = CustomHttpClient()
super().__init__(http_client=custom_client, *args, **kwargs)
# Monkey patch the OpenAI class in the openai module, so that the eval lib can use it
openai.OpenAI = CustomOpenAI
judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
judge_endpoint = os.getenv("JUDGE_ENDPOINT")
judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH")
use_tls = os.path.exists(judge_ca_cert_path) and (
os.path.getsize(judge_ca_cert_path) > 0
)
judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None
print("Starting Final Eval...")
@@ -2267,6 +2247,7 @@ def find_node_dataset_directories(base_dir: str):
server_url=vllm_server,
serving_gpus=gpu_count,
max_workers=max_workers,
http_client=judge_http_client,
)
shutdown_vllm(vllm_process)
@@ -2277,6 +2258,7 @@ def find_node_dataset_directories(base_dir: str):
api_key=judge_api_key,
serving_gpus=gpu_count,
max_workers=max_workers,
http_client=judge_http_client,
)
qa_pairs_and_errors.append((overall_score, qa_pairs, error_rate))
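For reference, the standalone.py changes above drop the earlier monkey-patching of openai.OpenAI and instead build an optional TLS-aware httpx client up front, which is then handed to the evaluator calls in run_mt_bench_op and run_final_eval_op via http_client. A minimal sketch of that pattern, with an illustrative helper name that is not part of this commit:

```python
import os

import httpx
import openai


def make_openai_client(endpoint: str, api_key: str, ca_cert_path: str | None) -> openai.OpenAI:
    # Use a custom CA bundle only when the cert file exists and is non-empty;
    # otherwise fall back to the default httpx transport (http_client=None).
    use_tls = bool(ca_cert_path) and os.path.exists(ca_cert_path) and os.path.getsize(ca_cert_path) > 0
    http_client = httpx.Client(verify=ca_cert_path) if use_tls else None
    return openai.OpenAI(base_url=endpoint, api_key=api_key, http_client=http_client)
```

The same httpx.Client-or-None value is what the diff passes as http_client= alongside server_url, api_key, serving_gpus, and max_workers, so plain-HTTP judges and judges behind a custom CA share a single code path.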
8 changes: 4 additions & 4 deletions utils/consts.py
@@ -1,4 +1,4 @@
PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook@sha256:0efbb3ad6f8f342360cf1f002d40716a39d4c58f69163e053d5bd19b4fe732d4"
TOOLBOX_IMAGE = "registry.redhat.io/ubi9/toolbox@sha256:da31dee8904a535d12689346e65e5b00d11a6179abf1fa69b548dbd755fa2770"
OC_IMAGE = "registry.redhat.io/openshift4/ose-cli@sha256:1d5c8442a6ec745e6ae44a7738c0681f1e21aac8be76ba826c2ddf2eed8475db"
RHELAI_IMAGE = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:b3dc9af0244aa6b84e6c3ef53e714a316daaefaae67e28de397cd71ee4b2ac7e"
PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook@sha256:0efbb3ad6f8f342360cf1f002d40716a39d4c58f69163e053d5bd19b4fe732d4" # v3-2024b-20250115
TOOLBOX_IMAGE = "registry.redhat.io/ubi9/toolbox@sha256:da31dee8904a535d12689346e65e5b00d11a6179abf1fa69b548dbd755fa2770" # v9.5
OC_IMAGE = "registry.redhat.io/openshift4/ose-cli@sha256:08bdbfae224dd39c81689ee73c183619d6b41eba7ac04f0dce7ee79f50531d0b" # v4.15.0
RHELAI_IMAGE = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9@sha256:05cfba1fb13ed54b1de4d021da2a31dd78ba7d8cc48e10c7fe372815899a18ae" # v1.3.2
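These images are pinned by sha256 digest rather than floating tags, so moving to a different build requires an explicit constant change like the one above. As a hypothetical usage sketch (the component and import path are illustrative, not taken from this commit), constants of this kind typically serve as base images for KFP components:

```python
from kfp import dsl

from utils.consts import RHELAI_IMAGE


@dsl.component(base_image=RHELAI_IMAGE)
def instructlab_step(message: str) -> str:
    # Executes inside the pinned instructlab-nvidia-rhel9 image.
    return message
```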
