Commit

update sdg generated line in knowledge-yaml
Signed-off-by: sallyom <[email protected]>
sallyom committed Oct 11, 2024
1 parent 8122f5b commit adaf720
Showing 3 changed files with 99 additions and 12 deletions.
33 changes: 33 additions & 0 deletions eval/final/components.py
@@ -199,6 +199,34 @@ def branch_eval_summary_to_json(

# MMLU_BRANCH

# This is very specific to `ilab generate`, necessary because the data generation and
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(tasks_dir):
    # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
    regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

    for root, dirs, files in os.walk(tasks_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            with open(file_path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            changed = False

            for line in lines:
                # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
                new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
                if new_line != line:
                    changed = True # Only rewrite the file if there's a change
                updated_lines.append(new_line)

            if changed:
                with open(file_path, "w") as file:
                    file.writelines(updated_lines)
                print(f"Updated: {file_path}")

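For illustration, a minimal, self-contained sketch of what this substitution does to a single line; the sample path, file name, and replacement directory below are hypothetical, not taken from this commit:

import re

# Same pattern as in update_test_lines_in_files above.
regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

# Hypothetical inputs: a 'test:' line written during SDG and the local tasks directory.
line = "  test: /tmp/sdg-run/node_datasets_2024-10-11/mmlu_pr_test.jsonl\n"
tasks_dir = "/mnt/eval/tasks"

new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
# Leading whitespace, 'test:', the node_datasets_* component, and the trailing path
# are preserved; only the prefix before node_datasets_* is swapped for tasks_dir:
#   "  test: /mnt/eval/tasks/node_datasets_2024-10-11/mmlu_pr_test.jsonl\n"
print(new_line)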
# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
    import os
@@ -223,6 +251,11 @@ def find_node_dataset_directories(base_directory: str):
# generates a node_datasets_ directory for MMLU custom tasks data
if node_dataset_dirs:
    tasks_dir = node_dataset_dirs[0]
    # From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
    # This needs to be updated to run elsewhere.
    # The line is:
    # test: /path/to/where/sdg/occurred/node_datasets_*
    update_test_lines_in_files(tasks_dir)

    mmlu_branch_evaluators = [
        MMLUBranchEvaluator(
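For context, a rough, self-contained sketch of the discovery step that picks tasks_dir from the SDG output; the directory names are made up for the example, and the discovery loop simply mirrors find_node_dataset_directories above:

import os
import re
import tempfile

# Build a throwaway layout that loosely resembles the `ilab sdg` output described
# in the comments above: a node_datasets_* directory somewhere under the tasks path.
base = tempfile.mkdtemp()
os.makedirs(os.path.join(base, "node_datasets_2024-10-11"))
os.makedirs(os.path.join(base, "unrelated_dir"))

# Discovery equivalent to find_node_dataset_directories.
matching_dirs = []
for root, dirs, files in os.walk(base):
    for directory in dirs:
        if re.search(r"node_datasets_", directory):
            matching_dirs.append(os.path.join(root, directory))

# The evaluation code takes the first match as tasks_dir and then calls
# update_test_lines_in_files(tasks_dir) on it before building the MMLU branch evaluators.
tasks_dir = matching_dirs[0]
print(tasks_dir)  # e.g. /tmp/tmpXXXXXXXX/node_datasets_2024-10-11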
45 changes: 33 additions & 12 deletions pipeline.yaml
@@ -1103,18 +1103,39 @@ deploymentSpec:
\ gpu_available\n else \"No GPU available\"\n )\n gpu_count\
\ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\
\ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MMLU_BRANCH\n\
\n # find_node_dataset_directories to find sdg output node_datasets_*\n\
\ def find_node_dataset_directories(base_directory: str):\n import\
\ os\n import re\n\n # This is specific to ilab/eval output\n\
\ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
\ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
\ for directory in dirs:\n if regex.search(directory):\n\
\ matching_dirs.append(os.path.join(root, directory))\n\
\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
\ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n # This\
\ assumes generated filesystem from ilab sdg, which\n # generates a node_datasets_\
\ directory for MMLU custom tasks data\n if node_dataset_dirs:\n \
\ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\
\n # This is very specific to `ilab generate`, necessary because the\
\ data generation and\n # model evaluation are taking place in separate\
\ environments.\n def update_test_lines_in_files(tasks_dir):\n \
\ # Define the regex to match lines starting with any indentation, 'test:',\
\ and containing 'node_datasets_*'\n regex = re.compile(r\"(\\s*test:\\\
s*).*/(node_datasets_[^/]*)(.*)\")\n\n for root, dirs, files in os.walk(tasks_dir):\n\
\ for file_name in files:\n file_path = os.path.join(root,\
\ file_name)\n\n with open(file_path, \"r\") as file:\n \
\ lines = file.readlines()\n\n updated_lines\
\ = []\n changed = False\n\n for line in lines:\n\
\ # Replace the matched line with the desired format,\
\ keeping 'test:' and leading whitespace intact\n new_line\
\ = re.sub(regex, rf\"\\1{tasks_dir}/\\2\\3\", line)\n \
\ if new_line != line:\n changed = True # Only\
\ rewrite the file if there's a change\n updated_lines.append(new_line)\n\
\n if changed:\n with open(file_path,\
\ \"w\") as file:\n file.writelines(updated_lines)\n\
\ print(f\"Updated: {file_path}\")\n\n # find_node_dataset_directories\
\ to find sdg output node_datasets_*\n def find_node_dataset_directories(base_directory:\
\ str):\n import os\n import re\n\n # This is specific\
\ to ilab/eval output\n pattern = r\"node_datasets_\"\n matching_dirs\
\ = []\n regex = re.compile(pattern)\n\n for root, dirs, files\
\ in os.walk(base_directory):\n for directory in dirs:\n \
\ if regex.search(directory):\n matching_dirs.append(os.path.join(root,\
\ directory))\n\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"\
]\n\n node_dataset_dirs = find_node_dataset_directories(tasks.path)\n\
\ # This assumes generated filesystem from ilab sdg, which\n # generates\
\ a node_datasets_ directory for MMLU custom tasks data\n if node_dataset_dirs:\n\
\ tasks_dir = node_dataset_dirs[0]\n # From `ilab sdg` the\
\ knowledge_*_task.yaml files have a line that references where the SDG\
\ took place.\n # This needs to be updated to run elsewhere.\n \
\ # The line is:\n # test: /path/to/where/sdg/occured/node_datasets_*\n\
\ update_test_lines_in_files(tasks_dir)\n\n mmlu_branch_evaluators\
\ = [\n MMLUBranchEvaluator(\n model_path=candidate_model,\n\
\ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\
\ few_shots=few_shots,\n batch_size=batch_size,\n\
33 changes: 33 additions & 0 deletions standalone/standalone.py
@@ -1767,6 +1767,34 @@ def branch_eval_summary_to_json(
# MMLU_BRANCH

# This is very specific to `ilab generate`, necessary because the data generation and
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(tasks_dir):
    # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
    regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

    for root, dirs, files in os.walk(tasks_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            with open(file_path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            changed = False

            for line in lines:
                # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
                new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
                if new_line != line:
                    changed = True # Only rewrite the file if there's a change
                updated_lines.append(new_line)

            if changed:
                with open(file_path, "w") as file:
                    file.writelines(updated_lines)
                print(f"Updated: {file_path}")
# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
    import os
@@ -1791,6 +1819,11 @@ def find_node_dataset_directories(base_directory: str):
# generates a node_datasets_ directory for MMLU custom tasks data
if node_dataset_dirs:
    tasks_dir = node_dataset_dirs[0]
    # From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
    # This needs to be updated to run elsewhere.
    # The line is:
    # test: /path/to/where/sdg/occurred/node_datasets_*
    update_test_lines_in_files(tasks_dir)

    mmlu_branch_evaluators = [
        MMLUBranchEvaluator(
