Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: remove phase mocking and faked packages #246

Merged
merged 1 commit into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Package initializer for `eval`: re-exports the two KFP evaluation
# components so callers can write `from eval import run_final_eval_op`
# instead of importing from the submodules directly.
from .final import run_final_eval_op
from .mt_bench import run_mt_bench_op

# Explicit public API of the package.
__all__ = ["run_final_eval_op", "run_mt_bench_op"]
2 changes: 1 addition & 1 deletion eval/final/components.py → eval/final.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from kfp.dsl import Artifact, Output, component

Expand Down
5 changes: 0 additions & 5 deletions eval/final/__init__.py

This file was deleted.

3 changes: 2 additions & 1 deletion eval/mt_bench/components.py → eval/mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from typing import NamedTuple, Optional

from kfp.dsl import component
Expand Down
5 changes: 0 additions & 5 deletions eval/mt_bench/__init__.py

This file was deleted.

879 changes: 413 additions & 466 deletions pipeline.py

Large diffs are not rendered by default.

83 changes: 41 additions & 42 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1528,24 +1528,24 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n from os import\
\ getenv, path\n\n import instructlab.sdg\n import openai\n import\
\ yaml\n\n api_key = getenv(\"api_key\")\n model = getenv(\"model\"\
)\n endpoint = getenv(\"endpoint\")\n\n sdg_ca_cert_path = getenv(\"\
SDG_CA_CERT_PATH\")\n use_tls = os.path.exists(sdg_ca_cert_path) and\
\ (\n os.path.getsize(sdg_ca_cert_path) > 0\n )\n if use_tls:\n\
\ import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert_path)\n\
\ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\
\ http_client=custom_http_client\n )\n else:\n client =\
\ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\
\ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\
\ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\
\ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \
\ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\
\ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\
\ magic word for its taxonomy_base argument - 'empty'\n # it allows\
\ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n import shutil\n\
\ import tempfile\n\n import instructlab.sdg\n import openai\n\
\ import xdg_base_dirs\n import yaml\n\n api_key = os.getenv(\"\
api_key\")\n model = os.getenv(\"model\")\n endpoint = os.getenv(\"\
endpoint\")\n\n sdg_ca_cert_path = os.getenv(\"SDG_CA_CERT_PATH\")\n\
\ use_tls = os.path.exists(sdg_ca_cert_path) and (\n os.path.getsize(sdg_ca_cert_path)\
\ > 0\n )\n if use_tls:\n import httpx\n\n custom_http_client\
\ = httpx.Client(verify=sdg_ca_cert_path)\n client = openai.OpenAI(\n\
\ base_url=endpoint, api_key=api_key, http_client=custom_http_client\n\
\ )\n else:\n client = openai.OpenAI(base_url=endpoint,\
\ api_key=api_key)\n\n taxonomy_base = \"main\" if repo_branch or (repo_pr\
\ and int(repo_pr) > 0) else \"empty\"\n\n print(\"Generating synthetic\
\ dataset for:\")\n print()\n print(\n instructlab.sdg.utils.taxonomy.read_taxonomy(\n\
\ taxonomy_path, taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\
\n )\n )\n\n # Generate synthetic dataset\n # 1.0 is the\
\ default size\n if sdg_sampling_size == 1.0:\n # generate_data\
\ has a magic word for its taxonomy_base argument - 'empty'\n # it\
\ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ instructlab.sdg.generate_data(\n client=client,\n \
\ num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\
Expand All @@ -1554,40 +1554,39 @@ deploymentSpec:
\ server_ctx_size=4096,\n )\n # Tweak precomputed skills\
\ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\
\n\n def set_precomputed_skills_data_ratio(sampling_size: float,\
\ skills_recipe: str):\n if path.exists(skills_recipe):\n \
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\
\ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\ skills_recipe: str):\n if os.path.exists(skills_recipe):\n\
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as\
\ file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\
\n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\
\ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\
\ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\
\ skills_recipe=skills_recipe\n )\n except PermissionError:\n\
\ print(\"Failed to set precomputed skills data ratio: Permission\
\ denied\")\n print(\"Attempting to move default data recipes\
\ to temporary directory\")\n import os\n import shutil\n\
\ import tempfile\n\n import xdg_base_dirs\n\n \
\ # Create a temporary directory\n with tempfile.TemporaryDirectory()\
\ as temp_dir:\n # Create a default_data_recipes directory\n\
\ temp_dir = path.join(temp_dir, \"default_data_recipes\"\
)\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\
\ to the temporary directory\n shutil.copy(skills_recipe,\
\ temp_dir)\n\n # Also copy the current pipeline directory\
\ to the temporary directory - it's a small\n # directory\
\ like 28KB\n # This isn't needed if the pipeline is either\
\ \"full\" or \"simple\" but it's future-proofing\n data_dirs\
\ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\
)\n for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = path.join(temp_dir, \"\
pipeline\")\n os.mkdir(temp_pipeline_dir)\n \
\ for d in data_dirs:\n pipeline_path = os.path.join(d,\
\ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\
\ shutil.copytree(\n pipeline_path,\n\
\ to temporary directory\")\n\n # Create a temporary directory\n\
\ with tempfile.TemporaryDirectory() as temp_dir:\n \
\ # Create a default_data_recipes directory\n temp_dir\
\ = os.path.join(temp_dir, \"default_data_recipes\")\n os.mkdir(temp_dir)\n\
\n # Copy default_data_recipes/skills.yaml to the temporary\
\ directory\n shutil.copy(skills_recipe, temp_dir)\n\n \
\ # Also copy the current pipeline directory to the temporary\
\ directory - it's a small\n # directory like 28KB\n \
\ # This isn't needed if the pipeline is either \"full\" or \"\
simple\" but it's future-proofing\n data_dirs = [\n \
\ os.path.join(str(dir), \"instructlab\", \"sdg\")\n \
\ for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = os.path.join(temp_dir, \"pipeline\"\
)\n os.mkdir(temp_pipeline_dir)\n for d in\
\ data_dirs:\n pipeline_path = os.path.join(d, \"pipelines\"\
, pipeline)\n if os.path.exists(pipeline_path):\n \
\ shutil.copytree(\n pipeline_path,\n\
\ temp_pipeline_dir,\n \
\ dirs_exist_ok=True,\n )\n \
\ break\n\n # Build new skills.yaml path\n \
\ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \
\ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\
\ # Override XDG_DATA_DIRS with the temporary directory\n\
\ new_skills_recipe = os.path.join(temp_dir, \"skills.yaml\")\n\
\ print(f\"New skills recipe path: {new_skills_recipe}\"\
)\n\n # Override XDG_DATA_DIRS with the temporary directory\n\
\ # This allows SDG to read the new skills.yaml since it's\
\ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\
\ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\
Expand Down
2 changes: 0 additions & 2 deletions sdg/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from . import faked
from .components import (
git_clone_op,
sdg_op,
Expand All @@ -11,5 +10,4 @@
"sdg_op",
"taxonomy_to_artifact_op",
"sdg_to_artifact_op",
"faked",
]
27 changes: 12 additions & 15 deletions sdg/components.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member
# pylint: disable=import-outside-toplevel,import-error
from typing import Optional

from kfp import dsl
Expand Down Expand Up @@ -38,17 +38,19 @@ def sdg_op(
sdg_sampling_size: float = 1.0,
):
import os
from os import getenv, path
import shutil
import tempfile

import instructlab.sdg
import openai
import xdg_base_dirs
import yaml

api_key = getenv("api_key")
model = getenv("model")
endpoint = getenv("endpoint")
api_key = os.getenv("api_key")
model = os.getenv("model")
endpoint = os.getenv("endpoint")

sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH")
sdg_ca_cert_path = os.getenv("SDG_CA_CERT_PATH")
use_tls = os.path.exists(sdg_ca_cert_path) and (
os.path.getsize(sdg_ca_cert_path) > 0
)
Expand Down Expand Up @@ -94,7 +96,7 @@ def sdg_op(
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
if os.path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

Expand All @@ -110,16 +112,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
temp_dir = os.path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
Expand All @@ -132,7 +129,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
temp_pipeline_dir = os.path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
Expand All @@ -145,7 +142,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
new_skills_recipe = os.path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
Expand Down
8 changes: 0 additions & 8 deletions sdg/faked/__init__.py

This file was deleted.

68 changes: 0 additions & 68 deletions sdg/faked/components.py

This file was deleted.

Loading
Loading